From cc45a76609a2a870f157121821962525efac9fed Mon Sep 17 00:00:00 2001
From: mmorale3 <miguel.mmorales@gmail.com>
Date: Wed, 16 Sep 2020 09:39:44 -0700
Subject: [PATCH] fixed issues on gpu with recent noncollinear changes, all
 unit tests pass non-collinear KP input

---
 .../boost_multi/multi/memory/allocator.hpp    |   3 +-
 .../KP3IndexFactorization_batched.hpp         | 147 ++++++++++++------
 .../Hamiltonians/KPFactorizedHamiltonian.cpp  |   4 +-
 .../detail/CUDA/Kernels/KaKjw_to_KKwaj.cu     |  58 ++++---
 .../detail/CUDA/Kernels/KaKjw_to_KKwaj.cuh    |   6 +
 .../detail/CUDA/Kernels/KaKjw_to_QKajw.cu     |  52 ++++---
 .../detail/CUDA/Kernels/KaKjw_to_QKajw.cuh    |   6 +
 .../detail/HIP/Kernels/KaKjw_to_KKwaj.hip.cpp |  56 ++++---
 .../detail/HIP/Kernels/KaKjw_to_KKwaj.hip.h   |   6 +
 .../detail/HIP/Kernels/KaKjw_to_QKajw.hip.cpp |  52 ++++---
 .../detail/HIP/Kernels/KaKjw_to_QKajw.hip.h   |   6 +
 src/AFQMC/Numerics/tensor_operations.hpp      |  44 ++++--
 .../Numerics/tests/test_tensor_operations.cpp |   4 +-
 .../SlaterDetOperations_serial.hpp            |   2 +-
 .../apply_expM.hpp                            |  84 ++++++++++
 .../SlaterDeterminantOperations/rotate.hpp    |   2 +-
 16 files changed, 374 insertions(+), 158 deletions(-)
diff --git a/external_codes/boost_multi/multi/memory/allocator.hpp b/external_codes/boost_multi/multi/memory/allocator.hpp
index d10f9d86f..2e93733cb 100644
--- a/external_codes/boost_multi/multi/memory/allocator.hpp
+++ b/external_codes/boost_multi/multi/memory/allocator.hpp
@@ -49,7 +49,8 @@ public:
 	bool operator!=(allocator const& o) const{return mp_ != o.mp_;}
 	using void_pointer = typename std::pointer_traits<decltype(std::declval<memory_type*>()->allocate(0, 0))>::template rebind<void>;
 	pointer allocate(size_type n){
-		return static_cast<pointer>(static_cast<void_pointer>(mp_->allocate(n*sizeof(value_type), alignof(T))));
+		return static_cast<pointer>(static_cast<void_pointer>(mp_->allocate(n*sizeof(value_type), 16)));
+		//return static_cast<pointer>(static_cast<void_pointer>(mp_->allocate(n*sizeof(value_type), alignof(T))));
 	}
 	void deallocate(pointer p, size_type n){
 		mp_->deallocate(p, n*sizeof(value_type));
diff --git a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp
index 270071a36..5d20c4178 100644
--- a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp
+++ b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp
@@ -82,6 +82,8 @@ class KP3IndexFactorization_batched
   using CMatrix_cref  = boost::multi::array_ref<ComplexType const, 2, const_pointer>;
   using CVector_ref   = ComplexVector_ref<pointer>;
   using CMatrix_ref   = ComplexMatrix_ref<pointer>;
+  using C3Tensor_ref = Complex3Tensor_ref<pointer>; 
+  using C4Tensor_ref = ComplexArray_ref<4, pointer>;
   using C3Tensor_cref = boost::multi::array_ref<ComplexType const, 3, const_pointer>;
 
   using SpMatrix_cref = boost::multi::array_ref<SPComplexType const, 2, sp_pointer>;
@@ -338,50 +340,96 @@ public:
   {
     int nkpts = nopk.size();
     int NMO   = std::accumulate(nopk.begin(), nopk.end(), 0);
-    // in non-collinear case with SO, keep SO matrix here and add it
-    // for now, stay collinear
+    int npol = (walker_type == NONCOLLINEAR) ? 2 : 1;
 
     CVector vMF_(vMF);
-    CVector P1D(iextensions<1u>{NMO * NMO});
-    fill_n(P1D.origin(), P1D.num_elements(), ComplexType(0));
-    vHS(vMF_, P1D);
+    CVector P0D(iextensions<1u>{NMO * NMO});
+    fill_n(P0D.origin(), P0D.num_elements(), ComplexType(0));
+    vHS(vMF_, P0D);
     if (TG_.TG().size() > 1)
-      TG_.TG().all_reduce_in_place_n(to_address(P1D.origin()), P1D.num_elements(), std::plus<>());
+      TG_.TG().all_reduce_in_place_n(to_address(P0D.origin()), P0D.num_elements(), std::plus<>());
 
-    boost::multi::array<ComplexType, 2> P1({NMO, NMO});
-    copy_n(P1D.origin(), NMO * NMO, P1.origin());
+    boost::multi::array<ComplexType, 2> P0({NMO, NMO});
+    copy_n(P0D.origin(), NMO * NMO, P0.origin());
 
-    // add H1 + vn0 and symmetrize
-    using ma::conj;
+    boost::multi::array<ComplexType, 2> P1({npol*NMO, npol*NMO});
+    std::fill_n(P1.origin(), P1.num_elements(), ComplexType(0.0));
 
+    // add spin-dependent H1
+    for (int K = 0, nk0 = 0; K < nkpts; ++K)
+    { 
+      for (int i = 0, I = nk0; i < nopk[K]; i++, I++)
+      { 
+        for(int p=0; p<npol; ++p) 
+          P1[p*NMO+I][p*NMO+I] += H1[K][p*nopk[K]+i][p*nopk[K]+i];
+        for (int j = i + 1, J = I + 1; j < nopk[K]; j++, J++)
+        { 
+          for(int p=0; p<npol; ++p)
+          {   
+              P1[p*NMO+I][p*NMO+J] += H1[K][p*nopk[K]+i][p*nopk[K]+j];
+              P1[p*NMO+J][p*NMO+I] += H1[K][p*nopk[K]+j][p*nopk[K]+i];
+          }
+        }
+        if(walker_type == NONCOLLINEAR) {
+          // offdiagonal piece
+          for (int j = 0, J = nk0; j < nopk[K]; j++, J++)
+          { 
+            P1[I][NMO+J] += H1[K][i][nopk[K]+j];
+            P1[NMO+J][I] += H1[K][nopk[K]+j][i];
+          }
+        }
+      }
+      nk0 += nopk[K];
+    }
+
+    // add P0 (diagonal in spin)
+    for(int p=0; p<npol; ++p)
+      for(int I=0; I<NMO; I++)
+        for(int J=0; J<NMO; J++)
+          P1[p*NMO+I][p*NMO+J] += P0[I][J];
+
+    // add vn0 (diagonal in spin)
     for (int K = 0, nk0 = 0; K < nkpts; ++K)
     {
       for (int i = 0, I = nk0; i < nopk[K]; i++, I++)
       {
-        P1[I][I] += H1[K][i][i] + vn0[K][i][i];
+        for(int p=0; p<npol; ++p)
+          P1[p*NMO+I][p*NMO+I] += vn0[K][i][i];
         for (int j = i + 1, J = I + 1; j < nopk[K]; j++, J++)
         {
-          P1[I][J] += H1[K][i][j] + vn0[K][i][j];
-          P1[J][I] += H1[K][j][i] + vn0[K][j][i];
-          // This is really cutoff dependent!!!
-#if MIXED_PRECISION
-          if (std::abs(P1[I][J] - ma::conj(P1[J][I])) * 2.0 > 1e-5)
+          for(int p=0; p<npol; ++p)
           {
-#else
-          if (std::abs(P1[I][J] - ma::conj(P1[J][I])) * 2.0 > 1e-6)
-          {
-#endif
-            app_error() << " WARNING in getOneBodyPropagatorMatrix. H1 is not hermitian. \n";
-            app_error() << I << " " << J << " " << P1[I][J] << " " << P1[J][I] << " " << H1[K][i][j] << " "
-                        << H1[K][j][i] << " " << vn0[K][i][j] << " " << vn0[K][j][i] << std::endl;
-            //APP_ABORT("Error in getOneBodyPropagatorMatrix. H1 is not hermitian. \n");
+            P1[p*NMO+I][p*NMO+J] += vn0[K][i][j];
+            P1[p*NMO+J][p*NMO+I] += vn0[K][j][i];
           }
-          P1[I][J] = 0.5 * (P1[I][J] + ma::conj(P1[J][I]));
-          P1[J][I] = ma::conj(P1[I][J]);
         }
       }
       nk0 += nopk[K];
     }
+
+    using ma::conj;
+    // symmetrize
+    for(int I=0; I<npol*NMO; I++)
+    {
+      for(int J=I+1; J<npol*NMO; J++)
+      {
+        // This is really cutoff dependent!!!
+#if MIXED_PRECISION
+        if (std::abs(P1[I][J] - ma::conj(P1[J][I])) * 2.0 > 1e-5)
+        {
+#else
+        if (std::abs(P1[I][J] - ma::conj(P1[J][I])) * 2.0 > 1e-6)
+        {
+#endif
+          app_error() << " WARNING in getOneBodyPropagatorMatrix. H1 is not hermitian. \n";
+          app_error() << I << " " << J << " " << P1[I][J] << " " << P1[J][I] <<std::endl;
+                      //<< H1[K][i][j] << " "
+                      //<< H1[K][j][i] << " " << vn0[K][i][j] << " " << vn0[K][j][i] << std::endl;
+        }
+        P1[I][J] = 0.5 * (P1[I][J] + ma::conj(P1[J][I]));
+        P1[J][I] = ma::conj(P1[I][J]);
+      }
+    }
     return P1;
   }
 
@@ -438,6 +486,7 @@ public:
 
     int nwalk     = Gc.size(1);
     int nspin     = (walker_type == COLLINEAR ? 2 : 1);
+    int npol      = (walker_type == NONCOLLINEAR ? 2 : 1);
     int nmo_tot   = std::accumulate(nopk.begin(), nopk.end(), 0);
     int nmo_max   = *std::max_element(nopk.begin(), nopk.end());
     int nocca_tot = std::accumulate(nelpk[nd].begin(), nelpk[nd].begin() + nkpts, 0);
@@ -495,13 +544,13 @@ public:
     for (int n = 0; n < nwalk; n++)
       fill_n(E[n].origin(), 3, ComplexType(0.));
 
-    assert(Gc.num_elements() == nwalk * (nocca_tot + noccb_tot) * nmo_tot);
-    C3Tensor_cref G3Da(make_device_ptr(Gc.origin()), {nocca_tot, nmo_tot, nwalk});
+    assert(Gc.num_elements() == nwalk * (nocca_tot + noccb_tot) * npol * nmo_tot);
+    C3Tensor_cref G3Da(make_device_ptr(Gc.origin()), {nocca_tot*npol, nmo_tot, nwalk});
     C3Tensor_cref G3Db(make_device_ptr(Gc.origin()) + G3Da.num_elements() * (nspin - 1), {noccb_tot, nmo_tot, nwalk});
 
     // later on, rewrite routine to loop over spins, to avoid storage of both spin
     // components simultaneously
-    Static4Tensor GKK({nspin, nkpts, nkpts, nwalk * nmo_max * nocc_max},
+    Static4Tensor GKK({nspin, nkpts, nkpts, nwalk  * npol * nmo_max * nocc_max},
                       device_buffer_allocator->template get_allocator<SPComplexType>());
     GKaKjw_to_GKKwaj(G3Da, GKK[0], nelpk[nd].sliced(0, nkpts), dev_nelpk[nd], dev_a0pk[nd]);
     if (walker_type == COLLINEAR)
@@ -519,10 +568,11 @@ public:
       for (int K = 0; K < nkpts; ++K)
       {
 #if defined(MIXED_PRECISION)
-        CMatrix_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max, nmo_max});
+        C3Tensor_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max, npol, nmo_max});
         for (int a = 0; a < nelpk[nd][K]; ++a)
-          ma::product(ComplexType(1.), ma::T(G3Da[na + a].sliced(nk, nk + nopk[K])), haj_K[a].sliced(0, nopk[K]),
-                      ComplexType(1.), E({0, nwalk}, 0));
+          for (int pol = 0; pol < npol; ++pol)
+            ma::product(ComplexType(1.), ma::T(G3Da[(na + a)*npol+p].sliced(nk, nk + nopk[K])), 
+                        haj_K[a][pol].sliced(0, nopk[K]), ComplexType(1.), E({0, nwalk}, 0));
         na += nelpk[nd][K];
         if (walker_type == COLLINEAR)
         {
@@ -538,8 +588,8 @@ public:
         nk = nopk[K];
         {
           na = nelpk[nd][K];
-          CVector_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max * nmo_max});
-          SpMatrix_ref Gaj(GKK[0][K][K].origin(), {nwalk, nocc_max * nmo_max});
+          CVector_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max * npol * nmo_max});
+          SpMatrix_ref Gaj(GKK[0][K][K].origin(), {nwalk, nocc_max * npol * nmo_max});
           ma::product(ComplexType(1.), Gaj, haj_K, ComplexType(1.), E({0, nwalk}, 0));
         }
         if (walker_type == COLLINEAR)
@@ -582,7 +632,7 @@ public:
       // I WANT C++17!!!!!!
       long mem_ank(0);
       if (needs_copy)
-        mem_ank = nkpts * nocc_max * nchol_max * nmo_max;
+        mem_ank = nkpts * nocc_max * nchol_max * npol * nmo_max;
       StaticVector LBuff(iextensions<1u>{2 * mem_ank},
                          device_buffer_allocator->template get_allocator<SPComplexType>());
       sp_pointer LQptr(nullptr), LQmptr(nullptr);
@@ -660,8 +710,8 @@ public:
 
               if (batch_cnt >= batch_size)
               {
-                gemmBatched('T', 'N', nocc_max * nchol_max, nwalk * nocc_max, nmo_max, SPComplexType(1.0),
-                            Aarray.data(), nmo_max, Barray.data(), nmo_max, SPComplexType(0.0), Carray.data(),
+                gemmBatched('T', 'N', nocc_max * nchol_max, nwalk * nocc_max, npol*nmo_max, SPComplexType(1.0),
+                            Aarray.data(), npol*nmo_max, Barray.data(), npol*nmo_max, SPComplexType(0.0), Carray.data(),
                             nocc_max * nchol_max, Aarray.size());
 
                 copy_n(scl_factors.data(), scl_factors.size(), dev_scl_factors.origin());
@@ -691,8 +741,8 @@ public:
 
           if (batch_cnt > 0)
           {
-            gemmBatched('T', 'N', nocc_max * nchol_max, nwalk * nocc_max, nmo_max, SPComplexType(1.0), Aarray.data(),
-                        nmo_max, Barray.data(), nmo_max, SPComplexType(0.0), Carray.data(), nocc_max * nchol_max,
+            gemmBatched('T', 'N', nocc_max * nchol_max, nwalk * nocc_max, npol*nmo_max, SPComplexType(1.0), Aarray.data(),
+                        npol*nmo_max, Barray.data(), npol*nmo_max, SPComplexType(0.0), Carray.data(), nocc_max * nchol_max,
                         Aarray.size());
 
             copy_n(scl_factors.data(), scl_factors.size(), dev_scl_factors.origin());
@@ -1322,6 +1372,7 @@ public:
     assert(v.size(0) == 2 * local_nCV);
     assert(v.size(1) == nwalk);
     int nspin     = (walker_type == COLLINEAR ? 2 : 1);
+    int npol      = (walker_type == NONCOLLINEAR ? 2 : 1);
     int nmo_tot   = std::accumulate(nopk.begin(), nopk.end(), 0);
     int nmo_max   = *std::max_element(nopk.begin(), nopk.end());
     int nocca_tot = std::accumulate(nelpk[nd].begin(), nelpk[nd].begin() + nkpts, 0);
@@ -1340,11 +1391,11 @@ public:
     SPComplexType minusimhalfa(0.0, -0.5 * a * scl);
     SPComplexType imhalfa(0.0, 0.5 * a * scl);
 
-    assert(G.num_elements() == nwalk * (nocca_tot + noccb_tot) * nmo_tot);
+    assert(G.num_elements() == nwalk * (nocca_tot + noccb_tot) * npol * nmo_tot);
     // MAM: use reshape when available, then no need to deal with types
     using GType = typename std::decay<MatA>::type::element;
     boost::multi::array_ref<GType const, 3, decltype(make_device_ptr(G.origin()))> G3Da(make_device_ptr(G.origin()),
-                                                                                        {nocca_tot, nmo_tot, nwalk});
+                                                                                        {nocca_tot*npol, nmo_tot, nwalk});
     boost::multi::array_ref<GType const, 3, decltype(make_device_ptr(G.origin()))> G3Db(make_device_ptr(G.origin()) +
                                                                                             G3Da.num_elements() *
                                                                                                 (nspin - 1),
@@ -1358,7 +1409,7 @@ public:
       size_t cnt(0);
       Static3Tensor v1({nkpts + number_of_symmetric_Q, nchol_max, nwalk},
                        device_buffer_allocator->template get_allocator<SPComplexType>());
-      Static3Tensor GQ({nkpts, nkpts * nocc_max * nmo_max, nwalk},
+      Static3Tensor GQ({nkpts, nkpts * nocc_max * npol * nmo_max, nwalk},
                        device_buffer_allocator->template get_allocator<SPComplexType>());
       fill_n(v1.origin(), v1.num_elements(), SPComplexType(0.0));
       fill_n(GQ.origin(), GQ.num_elements(), SPComplexType(0.0));
@@ -1370,7 +1421,7 @@ public:
                          dev_a0pk[nd].sliced(nkpts, 2 * nkpts));
 
       // can use productStridedBatched if LQKakn is changed to a 3Tensor array
-      int Kak = nkpts * nocc_max * nmo_max;
+      int Kak = nkpts * nocc_max * npol * nmo_max;
       std::vector<sp_pointer> Aarray;
       std::vector<sp_pointer> Barray;
       std::vector<sp_pointer> Carray;
@@ -1530,30 +1581,32 @@ private:
   template<class MatA, class MatB, class IVec, class IVec2>
   void GKaKjw_to_GKKwaj(MatA const& GKaKj, MatB&& GKKaj, IVec&& nocc, IVec2&& dev_no, IVec2&& dev_a0)
   {
+    int npol = (walker_type == NONCOLLINEAR) ? 2 : 1;
     int nmo_max = *std::max_element(nopk.begin(), nopk.end());
     //      int nocc_max = *std::max_element(nocc.begin(),nocc.end());
     int nmo_tot = GKaKj.size(1);
     int nwalk   = GKaKj.size(2);
     int nkpts   = nopk.size();
-    assert(GKKaj.num_elements() >= nkpts * nkpts * nwalk * nocc_max * nmo_max);
+    assert(GKKaj.num_elements() >= nkpts * nkpts * nwalk * nocc_max * npol * nmo_max);
 
     using ma::KaKjw_to_KKwaj;
-    KaKjw_to_KKwaj(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, dev_nopk.origin(), dev_i0pk.origin(), dev_no.origin(),
+    KaKjw_to_KKwaj(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, dev_nopk.origin(), dev_i0pk.origin(), dev_no.origin(),
                    dev_a0.origin(), GKaKj.origin(), GKKaj.origin());
   }
 
   template<class MatA, class MatB, class IVec, class IVec2>
   void GKaKjw_to_GQKajw(MatA const& GKaKj, MatB&& GQKaj, IVec&& nocc, IVec2&& dev_no, IVec2&& dev_a0)
   {
+    int npol = (walker_type == NONCOLLINEAR) ? 2 : 1;
     int nmo_max = *std::max_element(nopk.begin(), nopk.end());
     //      int nocc_max = *std::max_element(nocc.begin(),nocc.end());
     int nmo_tot = GKaKj.size(1);
     int nwalk   = GKaKj.size(2);
     int nkpts   = nopk.size();
-    assert(GQKaj.num_elements() >= nkpts * nkpts * nwalk * nocc_max * nmo_max);
+    assert(GQKaj.num_elements() >= nkpts * nkpts * nwalk * nocc_max * npol * nmo_max);
 
     using ma::KaKjw_to_QKajw;
-    KaKjw_to_QKajw(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, dev_nopk.origin(), dev_i0pk.origin(), dev_no.origin(),
+    KaKjw_to_QKajw(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, dev_nopk.origin(), dev_i0pk.origin(), dev_no.origin(),
                    dev_a0.origin(), dev_QKToK2.origin(), GKaKj.origin(), GQKaj.origin());
   }
 
diff --git a/src/AFQMC/Hamiltonians/KPFactorizedHamiltonian.cpp b/src/AFQMC/Hamiltonians/KPFactorizedHamiltonian.cpp
index 27b9a7ee7..ff281cd79 100644
--- a/src/AFQMC/Hamiltonians/KPFactorizedHamiltonian.cpp
+++ b/src/AFQMC/Hamiltonians/KPFactorizedHamiltonian.cpp
@@ -1143,7 +1143,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched(
   for (int nd = 0; nd < ndet; nd++)
   {
     for (int Q = 0; Q < number_of_symmetric_Q; Q++)
-      LQKbnl.emplace_back(shmSpMatrix({nkpts, ank_max}, shared_allocator<SPComplexType>{distNode}));
+      LQKbnl.emplace_back(shmSpMatrix({nkpts, npol*ank_max}, shared_allocator<SPComplexType>{distNode}));
     if (type == COLLINEAR)
     {
       for (int Q = 0; Q < number_of_symmetric_Q; Q++)
@@ -1161,7 +1161,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched(
   {
     for (int Q = 0; Q < number_of_symmetric_Q; Q++)
     {
-      LQKbln.emplace_back(shmSpMatrix({nkpts, ank_max}, shared_allocator<SPComplexType>{distNode}));
+      LQKbln.emplace_back(shmSpMatrix({nkpts, npol*ank_max}, shared_allocator<SPComplexType>{distNode}));
     }
     if (type == COLLINEAR)
     {
diff --git a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cu b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cu
index acdc06c65..45c56c7c9 100644
--- a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cu
+++ b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cu
@@ -24,11 +24,12 @@
 namespace kernels
 {
 // very sloppy, needs improvement!!!!
-// A[nocc_tot][nmo_tot][nwalk]
-// B[Ka][Kj][nwalk][nocc_max][nmo_max]
+// A[nocc_tot][s][nmo_tot][nwalk]
+// B[Ka][Kj][nwalk][nocc_max][s][nmo_max]
 template<typename T, typename T2>
 __global__ void kernel_KaKjw_to_KKwaj(int nwalk,
                                       int nkpts,
+                                      int npol,
                                       int nmo_max,
                                       int nmo_tot,
                                       int nocc_max,
@@ -41,30 +42,32 @@ __global__ void kernel_KaKjw_to_KKwaj(int nwalk,
 {
   int Ka = blockIdx.x;
   int Kj = blockIdx.y;
-  if (Ka >= nkpts || Kj >= nkpts)
+  int pol = blockIdx.z;
+  if (Ka >= nkpts || Kj >= nkpts || pol >= npol)
     return;
   int na0 = nocc0[Ka];
   int nj0 = nmo0[Kj];
   int na  = nocc[Ka];
   int nj  = nmo[Kj];
 
-  T const* A_(A + (na0 * nmo_tot + nj0) * nwalk);
-  T2* B_(B + ((Ka * nkpts + Kj) * nocc_max) * nmo_max * nwalk);
+  T const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk);
+  T2* B_(B + ((Ka * nkpts + Kj) * nocc_max) * npol * nmo_max * nwalk);
 
   if (threadIdx.x >= nj)
     return;
   if (threadIdx.y >= nwalk)
     return;
 
-  for (int a = 0, a1 = 0; a < na; a++, a1 += nmo_tot * nwalk)
+  for (int a = 0, a1 = pol * nmo_tot * nwalk; a < na; a++, a1 += npol * nmo_tot * nwalk)
     for (int j = threadIdx.x; j < nj; j += blockDim.x)
       for (int n = threadIdx.y; n < nwalk; n += blockDim.y)
-        B_[n * nocc_max * nmo_max + a * nmo_max + j] = static_cast<T2>(A_[a1 + j * nwalk + n]);
+        B_[((n * nocc_max + a) * npol + pol) * nmo_max + j] = static_cast<T2>(A_[a1 + j * nwalk + n]);
 }
 
 template<typename T, typename T2>
 __global__ void kernel_KaKjw_to_KKwaj(int nwalk,
                                       int nkpts,
+                                      int npol,
                                       int nmo_max,
                                       int nmo_tot,
                                       int nocc_max,
@@ -77,30 +80,32 @@ __global__ void kernel_KaKjw_to_KKwaj(int nwalk,
 {
   int Ka = blockIdx.x;
   int Kj = blockIdx.y;
-  if (Ka >= nkpts || Kj >= nkpts)
+  int pol = blockIdx.z;
+  if (Ka >= nkpts || Kj >= nkpts || pol >= npol)
     return;
   int na0 = nocc0[Ka];
   int nj0 = nmo0[Kj];
   int na  = nocc[Ka];
   int nj  = nmo[Kj];
 
-  thrust::complex<T> const* A_(A + (na0 * nmo_tot + nj0) * nwalk);
-  thrust::complex<T2>* B_(B + ((Ka * nkpts + Kj) * nocc_max) * nmo_max * nwalk);
+  thrust::complex<T> const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk);
+  thrust::complex<T2>* B_(B + ((Ka * nkpts + Kj) * nocc_max) * npol * nmo_max * nwalk);
 
   if (threadIdx.x >= nj)
     return;
   if (threadIdx.y >= nwalk)
     return;
 
-  for (int a = 0, a1 = 0; a < na; a++, a1 += nmo_tot * nwalk)
+  for (int a = 0, a1 = pol * nmo_tot * nwalk; a < na; a++, a1 += npol * nmo_tot * nwalk)
     for (int j = threadIdx.x; j < nj; j += blockDim.x)
       for (int n = threadIdx.y; n < nwalk; n += blockDim.y)
-        B_[n * nocc_max * nmo_max + a * nmo_max + j] = static_cast<thrust::complex<T2>>(A_[a1 + j * nwalk + n]);
+        B_[((n * nocc_max + a) * npol + pol) * nmo_max + j] = static_cast<thrust::complex<T2>>(A_[a1 + j * nwalk + n]);
 }
 
 
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -114,8 +119,8 @@ void KaKjw_to_KKwaj(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  kernel_KaKjw_to_KKwaj<<<grid_dim, block_dim>>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  kernel_KaKjw_to_KKwaj<<<grid_dim, block_dim>>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A,
                                                  B);
   qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj");
   qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_KKwaj");
@@ -123,6 +128,7 @@ void KaKjw_to_KKwaj(int nwalk,
 
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -136,8 +142,8 @@ void KaKjw_to_KKwaj(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  kernel_KaKjw_to_KKwaj<<<grid_dim, block_dim>>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  kernel_KaKjw_to_KKwaj<<<grid_dim, block_dim>>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A,
                                                  B);
   qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj");
   qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_KKwaj");
@@ -145,6 +151,7 @@ void KaKjw_to_KKwaj(int nwalk,
 
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -158,8 +165,8 @@ void KaKjw_to_KKwaj(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  kernel_KaKjw_to_KKwaj<<<grid_dim, block_dim>>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  kernel_KaKjw_to_KKwaj<<<grid_dim, block_dim>>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A,
                                                  B);
   qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj");
   qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_KKwaj");
@@ -167,6 +174,7 @@ void KaKjw_to_KKwaj(int nwalk,
 
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -180,8 +188,8 @@ void KaKjw_to_KKwaj(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  kernel_KaKjw_to_KKwaj<<<grid_dim, block_dim>>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  kernel_KaKjw_to_KKwaj<<<grid_dim, block_dim>>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
                                                  reinterpret_cast<thrust::complex<double> const*>(A),
                                                  reinterpret_cast<thrust::complex<double>*>(B));
   qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj");
@@ -190,6 +198,7 @@ void KaKjw_to_KKwaj(int nwalk,
 
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -203,8 +212,8 @@ void KaKjw_to_KKwaj(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  kernel_KaKjw_to_KKwaj<<<grid_dim, block_dim>>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  kernel_KaKjw_to_KKwaj<<<grid_dim, block_dim>>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
                                                  reinterpret_cast<thrust::complex<float> const*>(A),
                                                  reinterpret_cast<thrust::complex<float>*>(B));
   qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj");
@@ -213,6 +222,7 @@ void KaKjw_to_KKwaj(int nwalk,
 
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -226,8 +236,8 @@ void KaKjw_to_KKwaj(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  kernel_KaKjw_to_KKwaj<<<grid_dim, block_dim>>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  kernel_KaKjw_to_KKwaj<<<grid_dim, block_dim>>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
                                                  reinterpret_cast<thrust::complex<double> const*>(A),
                                                  reinterpret_cast<thrust::complex<float>*>(B));
   qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj");
diff --git a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cuh b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cuh
index 055353083..cb2ee706b 100644
--- a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cuh
+++ b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cuh
@@ -23,6 +23,7 @@ namespace kernels
 {
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -34,6 +35,7 @@ void KaKjw_to_KKwaj(int nwalk,
                     double* B);
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -45,6 +47,7 @@ void KaKjw_to_KKwaj(int nwalk,
                     float* B);
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -56,6 +59,7 @@ void KaKjw_to_KKwaj(int nwalk,
                     float* B);
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -67,6 +71,7 @@ void KaKjw_to_KKwaj(int nwalk,
                     std::complex<double>* B);
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -78,6 +83,7 @@ void KaKjw_to_KKwaj(int nwalk,
                     std::complex<float>* B);
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
diff --git a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cu b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cu
index 92f84e167..f6527b0e7 100644
--- a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cu
+++ b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cu
@@ -29,6 +29,7 @@ namespace kernels
 template<typename T, typename T2>
 __global__ void kernel_KaKjw_to_QKajw(int nwalk,
                                       int nkpts,
+                                      int npol,
                                       int nmo_max,
                                       int nmo_tot,
                                       int nocc_max,
@@ -42,7 +43,8 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
 {
   int Q = blockIdx.x;
   int K = blockIdx.y;
-  if (Q >= nkpts || K >= nkpts)
+  int pol = blockIdx.z;
+  if (Q >= nkpts || K >= nkpts || pol > npol)
     return;
   int QK  = QKtok2[Q * nkpts + K];
   int na0 = nocc0[K];
@@ -50,15 +52,16 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
   int na  = nocc[K];
   int nj  = nmo[QK];
 
-  T const* A_(A + (na0 * nmo_tot + nj0) * nwalk);
-  T2* B_(B + ((Q * nkpts + K) * nocc_max) * nmo_max * nwalk);
+  T const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk);
+  T2* B_(B + ((Q * nkpts + K) * nocc_max) * npol * nmo_max * nwalk);
 
   if (threadIdx.x >= nj)
     return;
   if (threadIdx.y >= nwalk)
     return;
 
-  for (int a = 0, a0 = 0, a1 = 0; a < na; a++, a0 += nmo_max * nwalk, a1 += nmo_tot * nwalk)
+  for (int a = 0, a0 = pol*nmo_max*nwalk, a1 = pol*nmo_tot*nwalk; 
+            a < na; a++, a0 += npol * nmo_max * nwalk, a1 += npol * nmo_tot * nwalk)
     for (int j = threadIdx.x; j < nj; j += blockDim.x)
       for (int n = threadIdx.y; n < nwalk; n += blockDim.y)
         B_[a0 + j * nwalk + n] = static_cast<T2>(A_[a1 + j * nwalk + n]);
@@ -67,6 +70,7 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
 template<typename T, typename T2>
 __global__ void kernel_KaKjw_to_QKajw(int nwalk,
                                       int nkpts,
+                                      int npol,
                                       int nmo_max,
                                       int nmo_tot,
                                       int nocc_max,
@@ -80,7 +84,8 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
 {
   int Q = blockIdx.x;
   int K = blockIdx.y;
-  if (Q >= nkpts || K >= nkpts)
+  int pol = blockIdx.z;
+  if (Q >= nkpts || K >= nkpts || pol > npol)
     return;
   int QK  = QKtok2[Q * nkpts + K];
   int na0 = nocc0[K];
@@ -88,15 +93,16 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
   int na  = nocc[K];
   int nj  = nmo[QK];
 
-  thrust::complex<T> const* A_(A + (na0 * nmo_tot + nj0) * nwalk);
-  thrust::complex<T2>* B_(B + ((Q * nkpts + K) * nocc_max) * nmo_max * nwalk);
+  thrust::complex<T> const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk);
+  thrust::complex<T2>* B_(B + ((Q * nkpts + K) * nocc_max) * npol * nmo_max * nwalk);
 
   if (threadIdx.x >= nj)
     return;
   if (threadIdx.y >= nwalk)
     return;
 
-  for (int a = 0, a0 = 0, a1 = 0; a < na; a++, a0 += nmo_max * nwalk, a1 += nmo_tot * nwalk)
+  for (int a = 0, a0 = pol*nmo_max*nwalk, a1 = pol*nmo_tot*nwalk; 
+                a < na; a++, a0 += npol*nmo_max * nwalk, a1 += npol * nmo_tot * nwalk)
   {
     for (int j = threadIdx.x; j < nj; j += blockDim.x)
       for (int n = threadIdx.y; n < nwalk; n += blockDim.y)
@@ -106,6 +112,7 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
 
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -120,8 +127,8 @@ void KaKjw_to_QKajw(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  kernel_KaKjw_to_QKajw<<<grid_dim, block_dim>>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  kernel_KaKjw_to_QKajw<<<grid_dim, block_dim>>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
                                                  QKtok2, A, B);
   qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw");
   qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_QKajw");
@@ -129,6 +136,7 @@ void KaKjw_to_QKajw(int nwalk,
 
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -143,8 +151,8 @@ void KaKjw_to_QKajw(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  kernel_KaKjw_to_QKajw<<<grid_dim, block_dim>>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  kernel_KaKjw_to_QKajw<<<grid_dim, block_dim>>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
                                                  QKtok2, A, B);
   qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw");
   qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_QKajw");
@@ -152,6 +160,7 @@ void KaKjw_to_QKajw(int nwalk,
 
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -166,8 +175,8 @@ void KaKjw_to_QKajw(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  kernel_KaKjw_to_QKajw<<<grid_dim, block_dim>>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  kernel_KaKjw_to_QKajw<<<grid_dim, block_dim>>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
                                                  QKtok2, A, B);
   qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw");
   qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_QKajw");
@@ -175,6 +184,7 @@ void KaKjw_to_QKajw(int nwalk,
 
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -189,8 +199,8 @@ void KaKjw_to_QKajw(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  kernel_KaKjw_to_QKajw<<<grid_dim, block_dim>>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  kernel_KaKjw_to_QKajw<<<grid_dim, block_dim>>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
                                                  QKtok2, reinterpret_cast<thrust::complex<float> const*>(A),
                                                  reinterpret_cast<thrust::complex<float>*>(B));
   qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw");
@@ -199,6 +209,7 @@ void KaKjw_to_QKajw(int nwalk,
 
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -213,8 +224,8 @@ void KaKjw_to_QKajw(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  kernel_KaKjw_to_QKajw<<<grid_dim, block_dim>>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  kernel_KaKjw_to_QKajw<<<grid_dim, block_dim>>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
                                                  QKtok2, reinterpret_cast<thrust::complex<double> const*>(A),
                                                  reinterpret_cast<thrust::complex<double>*>(B));
   qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw");
@@ -223,6 +234,7 @@ void KaKjw_to_QKajw(int nwalk,
 
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -237,8 +249,8 @@ void KaKjw_to_QKajw(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  kernel_KaKjw_to_QKajw<<<grid_dim, block_dim>>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  kernel_KaKjw_to_QKajw<<<grid_dim, block_dim>>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
                                                  QKtok2, reinterpret_cast<thrust::complex<double> const*>(A),
                                                  reinterpret_cast<thrust::complex<float>*>(B));
   qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw");
diff --git a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cuh b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cuh
index 164e6c598..0315a29b9 100644
--- a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cuh
+++ b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cuh
@@ -23,6 +23,7 @@ namespace kernels
 {
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -35,6 +36,7 @@ void KaKjw_to_QKajw(int nwalk,
                     double* B);
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -47,6 +49,7 @@ void KaKjw_to_QKajw(int nwalk,
                     float* B);
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -59,6 +62,7 @@ void KaKjw_to_QKajw(int nwalk,
                     float* B);
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -71,6 +75,7 @@ void KaKjw_to_QKajw(int nwalk,
                     std::complex<float>* B);
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -83,6 +88,7 @@ void KaKjw_to_QKajw(int nwalk,
                     std::complex<double>* B);
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
diff --git a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.cpp b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.cpp
index b4bd304ba..0bbcd0157 100644
--- a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.cpp
+++ b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.cpp
@@ -25,6 +25,7 @@ namespace kernels
 template<typename T, typename T2>
 __global__ void kernel_KaKjw_to_KKwaj(int nwalk,
                                       int nkpts,
+                                      int npol,
                                       int nmo_max,
                                       int nmo_tot,
                                       int nocc_max,
@@ -37,30 +38,32 @@ __global__ void kernel_KaKjw_to_KKwaj(int nwalk,
 {
   int Ka = blockIdx.x;
   int Kj = blockIdx.y;
-  if (Ka >= nkpts || Kj >= nkpts)
+  int pol = blockIdx.z;
+  if (Ka >= nkpts || Kj >= nkpts || pol >= npol)
     return;
   int na0 = nocc0[Ka];
   int nj0 = nmo0[Kj];
   int na  = nocc[Ka];
   int nj  = nmo[Kj];
 
-  T const* A_(A + (na0 * nmo_tot + nj0) * nwalk);
-  T2* B_(B + ((Ka * nkpts + Kj) * nocc_max) * nmo_max * nwalk);
+  T const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk);
+  T2* B_(B + ((Ka * nkpts + Kj) * nocc_max) * npol * nmo_max * nwalk);
 
   if (threadIdx.x >= nj)
     return;
   if (threadIdx.y >= nwalk)
     return;
 
-  for (int a = 0, a1 = 0; a < na; a++, a1 += nmo_tot * nwalk)
+  for (int a = 0, a1 = pol * nmo_tot * nwalk; a < na; a++, a1 += npol * nmo_tot * nwalk)
     for (int j = threadIdx.x; j < nj; j += blockDim.x)
       for (int n = threadIdx.y; n < nwalk; n += blockDim.y)
-        B_[n * nocc_max * nmo_max + a * nmo_max + j] = static_cast<T2>(A_[a1 + j * nwalk + n]);
+        B_[((n * nocc_max + a) * npol + pol) * nmo_max + j] = static_cast<T2>(A_[a1 + j * nwalk + n]);
 }
 
 template<typename T, typename T2>
 __global__ void kernel_KaKjw_to_KKwaj(int nwalk,
                                       int nkpts,
+                                      int npol,
                                       int nmo_max,
                                       int nmo_tot,
                                       int nocc_max,
@@ -73,30 +76,32 @@ __global__ void kernel_KaKjw_to_KKwaj(int nwalk,
 {
   int Ka = blockIdx.x;
   int Kj = blockIdx.y;
-  if (Ka >= nkpts || Kj >= nkpts)
+  int pol = blockIdx.z;
+  if (Ka >= nkpts || Kj >= nkpts || pol >= npol)
     return;
   int na0 = nocc0[Ka];
   int nj0 = nmo0[Kj];
   int na  = nocc[Ka];
   int nj  = nmo[Kj];
 
-  thrust::complex<T> const* A_(A + (na0 * nmo_tot + nj0) * nwalk);
-  thrust::complex<T2>* B_(B + ((Ka * nkpts + Kj) * nocc_max) * nmo_max * nwalk);
+  thrust::complex<T> const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk);
+  thrust::complex<T2>* B_(B + ((Ka * nkpts + Kj) * nocc_max) * npol * nmo_max * nwalk);
 
   if (threadIdx.x >= nj)
     return;
   if (threadIdx.y >= nwalk)
     return;
 
-  for (int a = 0, a1 = 0; a < na; a++, a1 += nmo_tot * nwalk)
+  for (int a = 0, a1 = pol * nmo_tot * nwalk; a < na; a++, a1 += npol * nmo_tot * nwalk)
     for (int j = threadIdx.x; j < nj; j += blockDim.x)
       for (int n = threadIdx.y; n < nwalk; n += blockDim.y)
-        B_[n * nocc_max * nmo_max + a * nmo_max + j] = static_cast<thrust::complex<T2>>(A_[a1 + j * nwalk + n]);
+        B_[((n * nocc_max + a) * npol + pol) * nmo_max + j] = static_cast<thrust::complex<T2>>(A_[a1 + j * nwalk + n]);
 }
 
 
 void KaKjw_to_KKwaj(int nwalk,
-                    int nkpts,
+                    int nkpts, 
+                    int npol, 
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -110,8 +115,8 @@ void KaKjw_to_KKwaj(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
                      nocc_max, nmo, nmo0, nocc, nocc0, A, B);
   qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj");
   qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_KKwaj");
@@ -119,6 +124,7 @@ void KaKjw_to_KKwaj(int nwalk,
 
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol, 
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -132,8 +138,8 @@ void KaKjw_to_KKwaj(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
                      nocc_max, nmo, nmo0, nocc, nocc0, A, B);
   qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj");
   qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_KKwaj");
@@ -141,6 +147,7 @@ void KaKjw_to_KKwaj(int nwalk,
 
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol, 
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -154,8 +161,8 @@ void KaKjw_to_KKwaj(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
                      nocc_max, nmo, nmo0, nocc, nocc0, A, B);
   qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj");
   qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_KKwaj");
@@ -163,6 +170,7 @@ void KaKjw_to_KKwaj(int nwalk,
 
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol, 
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -176,8 +184,8 @@ void KaKjw_to_KKwaj(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
                      nocc_max, nmo, nmo0, nocc, nocc0, reinterpret_cast<thrust::complex<double> const*>(A),
                      reinterpret_cast<thrust::complex<double>*>(B));
   qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj");
@@ -186,6 +194,7 @@ void KaKjw_to_KKwaj(int nwalk,
 
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol, 
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -199,8 +208,8 @@ void KaKjw_to_KKwaj(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
                      nocc_max, nmo, nmo0, nocc, nocc0, reinterpret_cast<thrust::complex<float> const*>(A),
                      reinterpret_cast<thrust::complex<float>*>(B));
   qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj");
@@ -209,6 +218,7 @@ void KaKjw_to_KKwaj(int nwalk,
 
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol, 
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -222,8 +232,8 @@ void KaKjw_to_KKwaj(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
                      nocc_max, nmo, nmo0, nocc, nocc0, reinterpret_cast<thrust::complex<double> const*>(A),
                      reinterpret_cast<thrust::complex<float>*>(B));
   qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj");
diff --git a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.h b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.h
index 1fa43f492..ebab86b49 100644
--- a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.h
+++ b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.h
@@ -20,6 +20,7 @@ namespace kernels
 {
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -31,6 +32,7 @@ void KaKjw_to_KKwaj(int nwalk,
                     double* B);
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -42,6 +44,7 @@ void KaKjw_to_KKwaj(int nwalk,
                     float* B);
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -53,6 +56,7 @@ void KaKjw_to_KKwaj(int nwalk,
                     float* B);
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -64,6 +68,7 @@ void KaKjw_to_KKwaj(int nwalk,
                     std::complex<double>* B);
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -75,6 +80,7 @@ void KaKjw_to_KKwaj(int nwalk,
                     std::complex<float>* B);
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
diff --git a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.cpp b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.cpp
index 9b9ca77cc..6fd46e0bc 100644
--- a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.cpp
+++ b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.cpp
@@ -26,6 +26,7 @@ namespace kernels
 template<typename T, typename T2>
 __global__ void kernel_KaKjw_to_QKajw(int nwalk,
                                       int nkpts,
+                                      int npol,
                                       int nmo_max,
                                       int nmo_tot,
                                       int nocc_max,
@@ -39,7 +40,8 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
 {
   int Q = blockIdx.x;
   int K = blockIdx.y;
-  if (Q >= nkpts || K >= nkpts)
+  int pol = blockIdx.z;
+  if (Q >= nkpts || K >= nkpts || pol > npol)
     return;
   int QK  = QKtok2[Q * nkpts + K];
   int na0 = nocc0[K];
@@ -47,15 +49,16 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
   int na  = nocc[K];
   int nj  = nmo[QK];
 
-  T const* A_(A + (na0 * nmo_tot + nj0) * nwalk);
-  T2* B_(B + ((Q * nkpts + K) * nocc_max) * nmo_max * nwalk);
+  T const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk);
+  T2* B_(B + ((Q * nkpts + K) * nocc_max) * npol * nmo_max * nwalk);
 
   if (threadIdx.x >= nj)
     return;
   if (threadIdx.y >= nwalk)
     return;
 
-  for (int a = 0, a0 = 0, a1 = 0; a < na; a++, a0 += nmo_max * nwalk, a1 += nmo_tot * nwalk)
+  for (int a = 0, a0 = pol*nmo_max*nwalk, a1 = pol*nmo_tot*nwalk; 
+                a < na; a++, a0 += npol * nmo_max * nwalk, a1 += npol * nmo_tot * nwalk)
     for (int j = threadIdx.x; j < nj; j += blockDim.x)
       for (int n = threadIdx.y; n < nwalk; n += blockDim.y)
         B_[a0 + j * nwalk + n] = static_cast<T2>(A_[a1 + j * nwalk + n]);
@@ -64,6 +67,7 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
 template<typename T, typename T2>
 __global__ void kernel_KaKjw_to_QKajw(int nwalk,
                                       int nkpts,
+                                      int npol,
                                       int nmo_max,
                                       int nmo_tot,
                                       int nocc_max,
@@ -77,7 +81,8 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
 {
   int Q = blockIdx.x;
   int K = blockIdx.y;
-  if (Q >= nkpts || K >= nkpts)
+  int pol = blockIdx.z;
+  if (Q >= nkpts || K >= nkpts || pol > npol)
     return;
   int QK  = QKtok2[Q * nkpts + K];
   int na0 = nocc0[K];
@@ -85,15 +90,16 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
   int na  = nocc[K];
   int nj  = nmo[QK];
 
-  thrust::complex<T> const* A_(A + (na0 * nmo_tot + nj0) * nwalk);
-  thrust::complex<T2>* B_(B + ((Q * nkpts + K) * nocc_max) * nmo_max * nwalk);
+  thrust::complex<T> const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk);
+  thrust::complex<T2>* B_(B + ((Q * nkpts + K) * npol * nocc_max) * nmo_max * nwalk);
 
   if (threadIdx.x >= nj)
     return;
   if (threadIdx.y >= nwalk)
     return;
 
-  for (int a = 0, a0 = 0, a1 = 0; a < na; a++, a0 += nmo_max * nwalk, a1 += nmo_tot * nwalk)
+  for (int a = 0, a0 = pol*nmo_max*nwalk, a1 = pol*nmo_tot*nwalk; 
+            a < na; a++, a0 += npol * nmo_max * nwalk, a1 += npol * nmo_tot * nwalk)
   {
     for (int j = threadIdx.x; j < nj; j += blockDim.x)
       for (int n = threadIdx.y; n < nwalk; n += blockDim.y)
@@ -103,6 +109,7 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
 
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -117,8 +124,8 @@ void KaKjw_to_QKajw(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
                      nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, A, B);
   qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw");
   qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_QKajw");
@@ -126,6 +133,7 @@ void KaKjw_to_QKajw(int nwalk,
 
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -140,8 +148,8 @@ void KaKjw_to_QKajw(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
                      nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, A, B);
   qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw");
   qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_QKajw");
@@ -149,6 +157,7 @@ void KaKjw_to_QKajw(int nwalk,
 
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -163,8 +172,8 @@ void KaKjw_to_QKajw(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
                      nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, A, B);
   qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw");
   qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_QKajw");
@@ -172,6 +181,7 @@ void KaKjw_to_QKajw(int nwalk,
 
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -186,8 +196,8 @@ void KaKjw_to_QKajw(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
                      nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, reinterpret_cast<thrust::complex<float> const*>(A),
                      reinterpret_cast<thrust::complex<float>*>(B));
   qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw");
@@ -196,6 +206,7 @@ void KaKjw_to_QKajw(int nwalk,
 
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -210,8 +221,8 @@ void KaKjw_to_QKajw(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
                      nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, reinterpret_cast<thrust::complex<double> const*>(A),
                      reinterpret_cast<thrust::complex<double>*>(B));
   qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw");
@@ -220,6 +231,7 @@ void KaKjw_to_QKajw(int nwalk,
 
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -234,8 +246,8 @@ void KaKjw_to_QKajw(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
                      nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, reinterpret_cast<thrust::complex<double> const*>(A),
                      reinterpret_cast<thrust::complex<float>*>(B));
   qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw");
diff --git a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.h b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.h
index e773dce13..c8b84a021 100644
--- a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.h
+++ b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.h
@@ -20,6 +20,7 @@ namespace kernels
 {
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -32,6 +33,7 @@ void KaKjw_to_QKajw(int nwalk,
                     double* B);
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -44,6 +46,7 @@ void KaKjw_to_QKajw(int nwalk,
                     float* B);
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -56,6 +59,7 @@ void KaKjw_to_QKajw(int nwalk,
                     float* B);
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -68,6 +72,7 @@ void KaKjw_to_QKajw(int nwalk,
                     std::complex<float>* B);
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -80,6 +85,7 @@ void KaKjw_to_QKajw(int nwalk,
                     std::complex<double>* B);
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
diff --git a/src/AFQMC/Numerics/tensor_operations.hpp b/src/AFQMC/Numerics/tensor_operations.hpp
index f4ea13191..e93757a64 100644
--- a/src/AFQMC/Numerics/tensor_operations.hpp
+++ b/src/AFQMC/Numerics/tensor_operations.hpp
@@ -29,6 +29,7 @@ namespace ma
 template<typename Q, typename T>
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -40,7 +41,7 @@ void KaKjw_to_KKwaj(int nwalk,
                     T* B)
 {
   // OpenMP: Combine Ka,Kj loops into single loop and call parallel for
-  int naj = nocc_max * nmo_max;
+  int napj = nocc_max * npol * nmo_max;
   int na0 = 0;
   for (int Ka = 0; Ka < nkpts; Ka++)
   {
@@ -50,16 +51,19 @@ void KaKjw_to_KKwaj(int nwalk,
     {
       int nj = nopk[Kj];
       //auto G_(to_address(GKK[0][Ka][Kj].origin()));
-      auto G_(B + (Ka * nkpts + Kj) * nwalk * nocc_max * nmo_max);
+      auto G_(B + (Ka * nkpts + Kj) * nwalk * nocc_max * npol * nmo_max);
       for (int a = 0; a < na; a++)
       {
-        //auto Gc_( to_address(Gca[na0+a][nj0].origin()) );
-        auto Gc_(A + (na0 + a) * nmo_tot * nwalk + nj0 * nwalk);
-        int aj = a * nmo_max;
-        for (int j = 0; j < nj; j++, aj++)
+        //auto Gc_( to_address(Gca[na0+a][p][nj0].origin()) );
+        int apj = a * npol * nmo_max;
+        for (int p = 0; p < npol; p++)
         {
-          for (int w = 0, waj = 0; w < nwalk; w++, ++Gc_, waj += naj)
-            G_[waj + aj] = static_cast<T>(*Gc_);
+          auto Gc_(A + ((na0 + a) * npol + p ) * nmo_tot * nwalk + nj0 * nwalk);
+          for (int j = 0; j < nj; j++, apj++)
+          {
+            for (int w = 0, wapj = 0; w < nwalk; w++, ++Gc_, wapj += napj)
+              G_[wapj + apj] = static_cast<T>(*Gc_);
+          }
         }
       }
       nj0 += nj;
@@ -71,6 +75,7 @@ void KaKjw_to_KKwaj(int nwalk,
 template<typename T, typename T1>
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -94,16 +99,19 @@ void KaKjw_to_QKajw(int nwalk,
       int na0 = nocc0[Ka];
       int nj0 = nmo0[Kj];
       //auto G_(to_address(GKK[Q][K].origin()));
-      auto G_(B + (Q * nkpts + K) * nwalk * nocc_max * nmo_max);
-      for (int a = 0, a0 = 0; a < na; a++, a0 += nmo_max * nwalk)
+      auto G_(B + (Q * nkpts + K) * nwalk * nocc_max * npol * nmo_max);
+      for (int a = 0, a0 = 0; a < na; a++)
       {
-        //auto Gc_( to_address(Gca[na0+a][nj0].origin()) );
-        auto Gc_(A + (na0 + a) * nmo_tot * nwalk + nj0 * nwalk);
-        for (int j = 0, aj = a0; j < nj; j++, aj += nwalk)
+        for (int p = 0; p < npol; p++, a0 += nmo_max * nwalk)
         {
-          for (int w = 0; w < nwalk; w++, ++Gc_)
+          //auto Gc_( to_address(Gca[na0+a][p][nj0].origin()) );
+          auto Gc_(A + ((na0 + a) * npol + p) * nmo_tot * nwalk + nj0 * nwalk);
+          for (int j = 0, apj = a0; j < nj; j++, apj += nwalk)
           {
-            G_[aj + w] = static_cast<T>(*Gc_);
+            for (int w = 0; w < nwalk; w++, ++Gc_)
+            {
+              G_[apj + w] = static_cast<T>(*Gc_);
+            }
           }
         }
       }
@@ -275,6 +283,7 @@ namespace device
 template<typename T, typename Q>
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -285,13 +294,14 @@ void KaKjw_to_KKwaj(int nwalk,
                     device_pointer<Q> A,
                     device_pointer<T> B)
 {
-  kernels::KaKjw_to_KKwaj(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, to_address(nmo), to_address(nmo0), to_address(nocc),
+  kernels::KaKjw_to_KKwaj(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, to_address(nmo), to_address(nmo0), to_address(nocc),
                           to_address(nocc0), to_address(A), to_address(B));
 }
 
 template<typename T, typename Q>
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -303,7 +313,7 @@ void KaKjw_to_QKajw(int nwalk,
                     device_pointer<Q> A,
                     device_pointer<T> B)
 {
-  kernels::KaKjw_to_QKajw(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, to_address(nmo), to_address(nmo0), to_address(nocc),
+  kernels::KaKjw_to_QKajw(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, to_address(nmo), to_address(nmo0), to_address(nocc),
                           to_address(nocc0), to_address(QKtok2), to_address(A), to_address(B));
 }
 
diff --git a/src/AFQMC/Numerics/tests/test_tensor_operations.cpp b/src/AFQMC/Numerics/tests/test_tensor_operations.cpp
index 8b54c4124..b527aee87 100644
--- a/src/AFQMC/Numerics/tests/test_tensor_operations.cpp
+++ b/src/AFQMC/Numerics/tests/test_tensor_operations.cpp
@@ -100,7 +100,7 @@ TEST_CASE("KaKjw_to_KKwaj", "[Numerics][tensor_operations]")
   // KaKjw = numpy.arange(nk*na*nk*nj*nw).reshape((nk,na,nk,nj,nw))
   // KKwaj = numpy.transpose(KaKjw, (0,2,1,3,4))
   using ma::KaKjw_to_KKwaj;
-  KaKjw_to_KKwaj(nwalk, nk, nmo_k, nk * nmo_k, occ_k, nmo_pk.origin(), nmo_pk0.origin(), nel_pk.origin(),
+  KaKjw_to_KKwaj(nwalk, nk, 1, nmo_k, nk * nmo_k, occ_k, nmo_pk.origin(), nmo_pk0.origin(), nel_pk.origin(),
                  nel_pk0.origin(), KaKjw.origin(), KKwaj.origin());
   // Reference values from python
   REQUIRE(KKwaj[1][2][3][4][4] == Approx(1473.0));
@@ -127,7 +127,7 @@ TEST_CASE("KaKjw_to_QKwaj", "[Numerics][tensor_operations]")
   Tensor2D<int> qk_to_k2 = {{0, 1, 2, 3}, {1, 0, 3, 2}, {2, 3, 0, 1}, {3, 2, 1, 0}};
   copy_n(buffer.data(), buffer.size(), KaKjw.origin());
   using ma::KaKjw_to_QKajw;
-  KaKjw_to_QKajw(nwalk, nk, nmo_k, nk * nmo_k, occ_k, nmo_pk.origin(), nmo_pk0.origin(), nel_pk.origin(),
+  KaKjw_to_QKajw(nwalk, nk, 1, nmo_k, nk * nmo_k, occ_k, nmo_pk.origin(), nmo_pk0.origin(), nel_pk.origin(),
                  nel_pk0.origin(), qk_to_k2.origin(), KaKjw.origin(), QKajw.origin());
   // Just captured from output.
   REQUIRE(QKajw[1][2][3][4][4] == Approx(1904.0));
diff --git a/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp b/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp
index 73ff2e0ba..bc43967aa 100644
--- a/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp
+++ b/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp
@@ -180,7 +180,7 @@ public:
       TTensor_ref TMN_( TMN.origin(), {npol*nbatch, M, NAEA}); 
       TTensor_ref T1_( T1.origin(), {npol*nbatch, M, NAEA}); 
       TTensor_ref T2_( T2.origin(), {npol*nbatch, M, NAEA}); 
-      SlaterDeterminantOperations::batched::apply_expM(V, TMN_, T1_, T2_, order, TA);
+      SlaterDeterminantOperations::batched::apply_expM_noncollinear(V, TMN_, T1_, T2_, order, TA);
     } else {
       SlaterDeterminantOperations::batched::apply_expM(V, TMN, T1, T2, order, TA);
     } 
diff --git a/src/AFQMC/SlaterDeterminantOperations/apply_expM.hpp b/src/AFQMC/SlaterDeterminantOperations/apply_expM.hpp
index ea1dd664b..8a706c811 100644
--- a/src/AFQMC/SlaterDeterminantOperations/apply_expM.hpp
+++ b/src/AFQMC/SlaterDeterminantOperations/apply_expM.hpp
@@ -188,6 +188,90 @@ inline void apply_expM(const MatA& V, MatB&& S, MatC& T1, MatC& T2,
   }
 }
 
+/*
+ * Calculate S = exp(im*V)*S using a Taylor expansion of exp(V)
+ * Version for non_collinear calculations, where there are 2 S matrices per V in the batch.
+ */
+template<class MatA, class MatB, class MatC>
+inline void apply_expM_noncollinear(const MatA& V, MatB&& S, MatC& T1, MatC& T2, 
+                       int order = 6, char TA = 'N')
+{
+  static_assert(std::decay<MatA>::type::dimensionality == 3, " batched::apply_expM::dimenionality == 3");
+  static_assert(std::decay<MatB>::type::dimensionality == 3, " batched::apply_expM::dimenionality == 3");
+  static_assert(std::decay<MatC>::type::dimensionality == 3, " batched::apply_expM::dimenionality == 3");
+  assert(V.size(0)*2 == S.size(0)); 
+  assert(V.size(0)*2 == T1.size(0)); 
+  assert(V.size(0)*2 == T2.size(0));
+  assert(V.size(1) == V.size(2));
+  assert(V.size(2) == S.size(1));
+  assert(S.size(1) == T1.size(1));
+  assert(S.size(2) == T1.size(2));
+  assert(S.size(1) == T2.size(1));
+  assert(S.size(2) == T2.size(2));
+  // for now limit to continuous
+  assert(S.stride(0) == S.size(1) * S.size(2));
+  assert(T1.stride(0) == T1.size(1) * T1.size(2));
+  assert(T2.stride(0) == T2.size(1) * T2.size(2));
+  assert(S.stride(1) == S.size(2));
+  assert(T1.stride(1) == T1.size(2));
+  assert(T2.stride(1) == T2.size(2));
+  assert(S.stride(2) == 1);
+  assert(T1.stride(2) == 1);
+  assert(T2.stride(2) == 1);
+
+  using ComplexType = typename std::decay<MatB>::type::element;
+  ComplexType zero(0.);
+  ComplexType im(0.0, 1.0);
+  if (TA == 'H' || TA == 'h')
+    im = ComplexType(0.0, -1.0);
+  auto pT1(std::addressof(T1));
+  auto pT2(std::addressof(T2));
+
+  using pointerA = typename std::decay<MatA>::type::element_const_ptr; 
+  using pointerC = typename std::decay<MatC>::type::element_ptr; 
+
+  int nbatch = S.size(0);
+  int ldv = V.stride(1); 
+  int M = T2.size(2); 
+  int N = T2.size(1);
+  int K = T1.size(1);
+
+  std::vector<pointerA> Vi;
+  std::vector<pointerC> T1i;
+  std::vector<pointerC> T2i;
+  Vi.reserve(2*V.size(0));  
+  T1i.reserve(T1.size(0));  
+  T2i.reserve(T2.size(0));  
+  for (int i = 0; i < V.size(0); i++) { 
+    Vi.emplace_back(ma::pointer_dispatch(V[i].origin()));
+    Vi.emplace_back(ma::pointer_dispatch(V[i].origin()));
+  }  
+  for (int i = 0; i < T1.size(0); i++)
+    T1i.emplace_back(ma::pointer_dispatch(T1[i].origin()));
+  for (int i = 0; i < T2.size(0); i++)
+    T2i.emplace_back(ma::pointer_dispatch(T2[i].origin()));    
+
+  auto pT1i(std::addressof(T1i));
+  auto pT2i(std::addressof(T2i));
+
+  // getting around issue in multi, fix later
+  //T1 = S;
+  using std::copy_n;
+  copy_n(S.origin(), S.num_elements(), T1.origin());
+  for (int n = 1; n <= order; n++)
+  {
+    ComplexType fact = im * static_cast<ComplexType>(1.0 / static_cast<double>(n));
+    using ma::gemmBatched;
+    // careful with fortran ordering
+    gemmBatched('N',TA,M,N,K,fact, pT1i->data(), (*pT1).stride(1), Vi.data(), ldv, 
+                             zero, pT2i->data(), (*pT2).stride(1), nbatch);
+    using ma::axpy;
+    axpy(S.num_elements(), ComplexType(1.0), (*pT2).origin(), 1, S.origin(), 1);
+    std::swap(pT1, pT2);
+    std::swap(pT1i, pT2i);
+  }
+}
+
 } // namespace batched
 
 } // namespace SlaterDeterminantOperations
diff --git a/src/AFQMC/SlaterDeterminantOperations/rotate.hpp b/src/AFQMC/SlaterDeterminantOperations/rotate.hpp
index cdf85807d..2c04324eb 100644
--- a/src/AFQMC/SlaterDeterminantOperations/rotate.hpp
+++ b/src/AFQMC/SlaterDeterminantOperations/rotate.hpp
@@ -779,7 +779,7 @@ void getLakn_Lank_from_Lkin(MultiArray2DA&& Aai,
   // Lakn[a][sk][n] = sum_i Aai[as][i] conj(Lkin[k][i][n])
   for (int k = 0; k < nmo; k++)
   {
-    ma::product(ma::H(Lkin[k].sliced(0, ni)), ma::T(Aai), bnas);
+    ma::product(ma::H(Lkin[k].sliced(0, ni)), ma::T(Aas_i), bnas);
     for (int a = 0; a < na; a++)
       for (int p = 0; p < npol; p++)
         Lakn[a][p*nmo+k] = bnas({0, nchol}, a*npol+p);