From c324dfb421be696267081560073f2f1833cbe520 Mon Sep 17 00:00:00 2001
From: dyzheng <zhengdy@bjaisi.com>
Date: Mon, 2 Sep 2024 16:07:23 +0800
Subject: [PATCH] Feature: add stress for DFT+U and DeltaSpin in PW code

---
 .../hamilt_pwdft/CMakeLists.txt               |   1 +
 .../hamilt_pwdft/forces_onsite.cpp            |   5 -
 .../hamilt_pwdft/fs_nonlocal_tools.cpp        | 457 +++++++++++-------
 .../hamilt_pwdft/fs_nonlocal_tools.h          |   7 +-
 .../hamilt_pwdft/kernels/stress_op.cpp        |  44 +-
 .../hamilt_pwdft/kernels/stress_op.h          |   4 -
 .../hamilt_pwdft/stress_func.h                |  26 +-
 .../hamilt_pwdft/stress_func_onsite.cpp       | 113 +++++
 .../hamilt_pwdft/stress_pw.cpp                |  16 +-
 9 files changed, 452 insertions(+), 221 deletions(-)
 create mode 100644 source/module_hamilt_pw/hamilt_pwdft/stress_func_onsite.cpp
diff --git a/source/module_hamilt_pw/hamilt_pwdft/CMakeLists.txt b/source/module_hamilt_pw/hamilt_pwdft/CMakeLists.txt
index b85287fe2f..206f434d82 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/CMakeLists.txt
+++ b/source/module_hamilt_pw/hamilt_pwdft/CMakeLists.txt
@@ -22,6 +22,7 @@ list(APPEND objects
     stress_func_kin.cpp
     stress_func_loc.cpp
     stress_func_nl.cpp
+    stress_func_onsite.cpp
     stress_func_us.cpp
     stress_pw.cpp
     VL_in_pw.cpp
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_onsite.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_onsite.cpp
index 99668cd78c..2c2d710848 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces_onsite.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces_onsite.cpp
@@ -7,11 +7,6 @@
 #include "module_hamilt_lcao/module_dftu/dftu.h"
 #include "module_hamilt_lcao/module_deltaspin/spin_constrain.h"
 
-
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
 template <typename FPTYPE, typename Device>
 void Forces<FPTYPE, Device>::cal_force_onsite(ModuleBase::matrix& force_onsite,
                                           const ModuleBase::matrix& wg,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
index ae668ce811..b6e62fec68 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
@@ -7,6 +7,7 @@
 #include "module_base/tool_title.h"
 #include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h"
 #include "nonlocal_maths.hpp"
+
 #include <numeric>
 
 namespace hamilt
@@ -27,7 +28,7 @@ FS_Nonlocal_tools<FPTYPE, Device>::FS_Nonlocal_tools(const pseudopot_cell_vnl* n
 
     // seems kvec_c never used...
     this->kvec_c = this->wfc_basis_->template get_kvec_c_data<FPTYPE>();
-    // the following is important for calculating the whole contribution to 
+    // the following is important for calculating the whole contribution to
     // Hamiltonian or force, stress: sum{nk} fnk*sum_{ij}<psi_nk|ai>Dij<aj|psi_nk>
     // among, Dij is deeq.
     // For DFT+U and other projection involved operators, deeq also plays.
@@ -68,22 +69,24 @@ FS_Nonlocal_tools<FPTYPE, Device>::FS_Nonlocal_tools(const pseudopot_cell_vnl* n
     }
     // allocate memory
     this->allocate_memory(wg, ekb, this->nproj, nch);
-    this->ppcell_vkb = (this->device == base_device::GpuDevice)? this->nlpp_->template get_vkb_data<FPTYPE>() : this->nlpp_->vkb.c;
+    this->ppcell_vkb
+        = (this->device == base_device::GpuDevice) ? this->nlpp_->template get_vkb_data<FPTYPE>() : this->nlpp_->vkb.c;
 }
 
 template <typename FPTYPE, typename Device>
-FS_Nonlocal_tools<FPTYPE, Device>::FS_Nonlocal_tools(const std::vector<int>& nproj,     // number of projectors for each atom type
-                                                     const std::vector<int>& lproj,
-                                                     const ModuleBase::realArray& tab,  // radials' spherical bessel transform
-                                                     const ModuleBase::matrix& nhtol,   // (it, ich) -> l, the ich is (l, m)-distinctive index
-                                                     std::complex<FPTYPE>* vkb_buf, // the vkb buffer
-                                                     const UnitCell* ucell_in,
-                                                     const psi::Psi<std::complex<FPTYPE>, Device>* psi_in,
-                                                     const K_Vectors* kv_in,
-                                                     const ModulePW::PW_Basis_K* wfc_basis_in,
-                                                     const Structure_Factor* sf_in,
-                                                     const ModuleBase::matrix& wg,
-                                                     const ModuleBase::matrix& ekb)
+FS_Nonlocal_tools<FPTYPE, Device>::FS_Nonlocal_tools(
+    const std::vector<int>& nproj, // number of projectors for each atom type
+    const std::vector<int>& lproj,
+    const ModuleBase::realArray& tab, // radials' spherical bessel transform
+    const ModuleBase::matrix& nhtol,  // (it, ich) -> l, the ich is (l, m)-distinctive index
+    std::complex<FPTYPE>* vkb_buf,    // the vkb buffer
+    const UnitCell* ucell_in,
+    const psi::Psi<std::complex<FPTYPE>, Device>* psi_in,
+    const K_Vectors* kv_in,
+    const ModulePW::PW_Basis_K* wfc_basis_in,
+    const Structure_Factor* sf_in,
+    const ModuleBase::matrix& wg,
+    const ModuleBase::matrix& ekb)
 {
     // this is a constructor for general case, including vnl, dftu, deltaspin, deepks, etc.
     // what is needed for this kind of constructor?
@@ -95,7 +98,6 @@ FS_Nonlocal_tools<FPTYPE, Device>::FS_Nonlocal_tools(const std::vector<int>& npr
     // rgrid: radial grid
     // deeq: the Dij matrix, Hubbard parameters or other quantities...
 
-
     // what are already programmed to be needed?
 
     // tab: the spherical transform of radial functions, with q = linspace(0, GlobalV::NQX, GlobalV::DQ)
@@ -105,15 +107,14 @@ FS_Nonlocal_tools<FPTYPE, Device>::FS_Nonlocal_tools(const std::vector<int>& npr
     // h_atom_nh: counterpart of atom_nh on host
     // max_nh: std::max_element(atom_nh.begin(), atom_nh.end())
 
-
     // in conclusion, this constructor needs the following individual information:
-    
-    // nproj 
+
+    // nproj
     // tab (projs is not needed, should be calculated elsewhere)
-    // lproj 
+    // lproj
     // deeq, with its dims. it will be good to pass the whole realarray
 
-    // what can be built here 
+    // what can be built here
     // nhtol
     // nkb
     // atom_nh, h_atom_nh, max_nh
@@ -165,7 +166,7 @@ FS_Nonlocal_tools<FPTYPE, Device>::~FS_Nonlocal_tools()
 }
 
 template <typename FPTYPE, typename Device>
-void FS_Nonlocal_tools<FPTYPE, Device>::allocate_memory(const ModuleBase::matrix& wg, 
+void FS_Nonlocal_tools<FPTYPE, Device>::allocate_memory(const ModuleBase::matrix& wg,
                                                         const ModuleBase::matrix& ekb,
                                                         const std::vector<int>& nproj,
                                                         const std::vector<int>& nch)
@@ -274,22 +275,25 @@ void FS_Nonlocal_tools<FPTYPE, Device>::delete_memory()
 // . the multiplication with sk should be within specific operator
 // because the atom selection task is operator-specific.
 template <typename FPTYPE, typename Device>
-void FS_Nonlocal_tools<FPTYPE, Device>::cal_becp(int ik, int npm, std::complex<FPTYPE>* becp_in, const std::complex<FPTYPE>* ppsi_in)
+void FS_Nonlocal_tools<FPTYPE, Device>::cal_becp(int ik,
+                                                 int npm,
+                                                 std::complex<FPTYPE>* becp_in,
+                                                 const std::complex<FPTYPE>* ppsi_in)
 {
     ModuleBase::TITLE("FS_Nonlocal_tools", "cal_becp");
     ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_becp");
-        
+
     const int npol = this->ucell_->get_npol();
-    const std::complex<FPTYPE>* ppsi = ppsi_in==nullptr? &(this->psi_[0](ik, 0, 0)) : ppsi_in;
+    const std::complex<FPTYPE>* ppsi = ppsi_in == nullptr ? &(this->psi_[0](ik, 0, 0)) : ppsi_in;
     const int npw = this->wfc_basis_->npwk[ik];
-    if(becp_in == nullptr && this->becp == nullptr)
+    if (becp_in == nullptr && this->becp == nullptr)
     {
         resmem_complex_op()(this->ctx, becp, this->nbands * npol * this->nkb);
     }
-    std::complex<FPTYPE>* becp_tmp = becp_in==nullptr? this->becp : becp_in;
+    std::complex<FPTYPE>* becp_tmp = becp_in == nullptr ? this->becp : becp_in;
     const int size_becp_act = npm * npol * this->nkb;
-    if(ik != this->current_ik)// different ik, need to recalculate vkb
-    { 
+    if (ik != this->current_ik) // different ik, need to recalculate vkb
+    {
         const int size_becp = this->nbands * npol * this->nkb;
         if (this->becp == nullptr)
         {
@@ -363,7 +367,7 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_becp(int ik, int npm, std::complex<F
                         GlobalV::DQ,
                         nproj[it],
                         hd_vq); // hd_vq has dimension (nprojmax, npwx), this size will be the largest needed.
-            
+
             // DEBUG: ONCE YOU CHECK vq VALUES, YOU UNCOMMENT THE FOLLOWING LINE
             // for(int ip = 0; ip < nproj[it]; ip++)
             // {
@@ -375,22 +379,15 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_becp(int ik, int npm, std::complex<F
             //     std::cout << std::endl;
             // }
             // std::cout << std::endl;
-            
+
             // prepare（-i）^l, size: nh
             std::vector<std::complex<double>> pref = maths.cal_pref(it, h_atom_nh[it]);
             const int nh = pref.size();
             this->dvkb_indexes.resize(nh * 4);
             // print the value of nhtol
             // nhtol->print(std::cout); // as checked, nhtol works as expected
-            maths.cal_dvkb_index(nproj[it],
-                                this->nhtol->c,
-                                this->nhtol->nc,
-                                npw,
-                                it,
-                                0,
-                                0,
-                                this->dvkb_indexes.data());
-            
+            maths.cal_dvkb_index(nproj[it], this->nhtol->c, this->nhtol->nc, npw, it, 0, 0, this->dvkb_indexes.data());
+
             if (this->device == base_device::GpuDevice)
             {
                 syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_dvkb_indexes, dvkb_indexes.data(), nh * 4);
@@ -443,7 +440,6 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_becp(int ik, int npm, std::complex<F
               becp_tmp,
               this->nkb);
 
-
     if (this->device == base_device::GpuDevice)
     {
         std::complex<FPTYPE>* h_becp = nullptr;
@@ -595,54 +591,57 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, i
               dbecp,
               nkb);
     // calculate stress for target (ipol, jpol)
-    if(npol == 1)
+    if (stress != nullptr)
     {
-    const int current_spin = this->kv_->isk[ik];
-    cal_stress_nl_op()(this->ctx,
-                       nondiagonal,
-                       ipol,
-                       jpol,
-                       nkb,
-                       npm,
-                       this->ntype,
-                       current_spin, // uspp only
-                       this->nbands,
-                       ik,
-                       this->deeq_dims[1],
-                       this->deeq_dims[2],
-                       this->deeq_dims[3],
-                       atom_nh,
-                       atom_na,
-                       d_wg,
-                       d_ekb,
-                       qq_nt,
-                       deeq,
-                       becp,
-                       dbecp,
-                       stress);
-    }
-    else
-    {
-        cal_stress_nl_op()(this->ctx,
-                           ipol,
-                           jpol,
-                           nkb,
-                           npm,
-                           this->ntype,
-                           this->nbands,
-                           ik,
-                           this->deeq_nc_dims[1],
-                           this->deeq_nc_dims[2],
-                           this->deeq_nc_dims[3],
-                           atom_nh,
-                           atom_na,
-                           d_wg,
-                           d_ekb,
-                           qq_nt,
-                           this->deeq_nc,
-                           becp,
-                           dbecp,
-                           stress);
+        if (npol == 1)
+        {
+            const int current_spin = this->kv_->isk[ik];
+            cal_stress_nl_op()(this->ctx,
+                               nondiagonal,
+                               ipol,
+                               jpol,
+                               nkb,
+                               npm,
+                               this->ntype,
+                               current_spin, // uspp only
+                               this->nbands,
+                               ik,
+                               this->deeq_dims[1],
+                               this->deeq_dims[2],
+                               this->deeq_dims[3],
+                               atom_nh,
+                               atom_na,
+                               d_wg,
+                               d_ekb,
+                               qq_nt,
+                               deeq,
+                               becp,
+                               dbecp,
+                               stress);
+        }
+        else
+        {
+            cal_stress_nl_op()(this->ctx,
+                               ipol,
+                               jpol,
+                               nkb,
+                               npm,
+                               this->ntype,
+                               this->nbands,
+                               ik,
+                               this->deeq_nc_dims[1],
+                               this->deeq_nc_dims[2],
+                               this->deeq_nc_dims[3],
+                               atom_nh,
+                               atom_na,
+                               d_wg,
+                               d_ekb,
+                               qq_nt,
+                               this->deeq_nc,
+                               becp,
+                               dbecp,
+                               stress);
+        }
     }
     ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_s");
 }
@@ -700,7 +699,7 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_f(int ik, int npm, int ipol)
     const int npm_npol = npm * npol;
     const int size_becp = this->nbands * npol * this->nkb;
     if (this->dbecp == nullptr) // if it is the very first run, we allocate
-    { // why not judging whether dbecp == nullptr inside resmem_complex_op?
+    {                           // why not judging whether dbecp == nullptr inside resmem_complex_op?
         resmem_complex_op()(this->ctx, dbecp, 3 * size_becp);
     }
     // do gemm to get dbecp and revert the ppcell_vkb for next ipol
@@ -806,7 +805,8 @@ template <typename FPTYPE, typename Device>
 void FS_Nonlocal_tools<FPTYPE, Device>::transfer_gcar(int npw, int npw_max, const FPTYPE* gcar_in)
 {
     std::vector<FPTYPE> gcar_tmp(3 * npw_max); // [out], will overwritten this->gcar
-    gcar_tmp.assign(gcar_in, gcar_in + 3 * npw_max); // UNDEFINED BEHAVIOR!!! nobody always knows the memory layout of vector3
+    gcar_tmp.assign(gcar_in,
+                    gcar_in + 3 * npw_max); // UNDEFINED BEHAVIOR!!! nobody always knows the memory layout of vector3
     std::vector<int> gcar_zero_indexes_tmp(3 * npw_max); // a "checklist"
 
     int* gcar_zero_ptrs[3];
@@ -865,41 +865,17 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_force(int ik, int npm, FPTYPE* force
     const int current_spin = this->kv_->isk[ik];
     const int force_nc = 3;
     // calculate the force
-    if(this->ucell_->get_npol() == 1)
-    {
-        cal_force_nl_op<FPTYPE, Device>()(this->ctx,
-                                        nondiagonal,
-                                        npm,
-                                        this->nbands,
-                                        this->ntype,
-                                        current_spin,
-                                        this->deeq_dims[1],
-                                        this->deeq_dims[2],
-                                        this->deeq_dims[3],
-                                        force_nc,
-                                        this->nbands,
-                                        ik,
-                                        nkb,
-                                        atom_nh,
-                                        atom_na,
-                                        this->ucell_->tpiba,
-                                        d_wg,
-                                        d_ekb,
-                                        qq_nt,
-                                        deeq,
-                                        becp,
-                                        dbecp,
-                                        force);
-    }
-    else
+    if (this->ucell_->get_npol() == 1)
     {
         cal_force_nl_op<FPTYPE, Device>()(this->ctx,
+                                          nondiagonal,
                                           npm,
                                           this->nbands,
                                           this->ntype,
-                                          this->deeq_nc_dims[1],
-                                          this->deeq_nc_dims[2],
-                                          this->deeq_nc_dims[3],
+                                          current_spin,
+                                          this->deeq_dims[1],
+                                          this->deeq_dims[2],
+                                          this->deeq_dims[3],
                                           force_nc,
                                           this->nbands,
                                           ik,
@@ -910,31 +886,20 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_force(int ik, int npm, FPTYPE* force
                                           d_wg,
                                           d_ekb,
                                           qq_nt,
-                                          this->deeq_nc,
+                                          deeq,
                                           becp,
                                           dbecp,
                                           force);
     }
-}
-
-template <typename FPTYPE, typename Device>
-void FS_Nonlocal_tools<FPTYPE, Device>::cal_force_dftu(int ik, int npm, FPTYPE* force, const int* orbital_corr, const std::complex<FPTYPE>* vu)
-{
-            int* orbital_corr_tmp = const_cast<int*>(orbital_corr);
-            std::complex<FPTYPE>* vu_tmp = const_cast<std::complex<FPTYPE>*>(vu);
-#if defined(__CUDA) || defined(__ROCM)
-            if (this->device == "gpu") {
-                resmem_int_op()(gpu_ctx, orbital_corr_tmp, this->ucell_->ntype);
-                syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, orbital_corr_tmp, dftu->orbital_corr, this->ucell_->ntype);
-                resmem_cd_op()(gpu_ctx, vu_tmp, dftu->get_size_eff_pot_pw());
-                syncmem_cd_h2d_op()(gpu_ctx, cpu_ctx, vu_tmp, dftu->get_eff_pot_pw(0), dftu->get_size_eff_pot_pw());
-            }
-#endif
-            const int force_nc = 3;
-            cal_force_nl_op<FPTYPE, Device>()(this->ctx,
+    else
+    {
+        cal_force_nl_op<FPTYPE, Device>()(this->ctx,
                                           npm,
                                           this->nbands,
                                           this->ntype,
+                                          this->deeq_nc_dims[1],
+                                          this->deeq_nc_dims[2],
+                                          this->deeq_nc_dims[3],
                                           force_nc,
                                           this->nbands,
                                           ik,
@@ -943,60 +908,193 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_force_dftu(int ik, int npm, FPTYPE*
                                           atom_na,
                                           this->ucell_->tpiba,
                                           d_wg,
-                                          vu_tmp,
-                                          orbital_corr_tmp,
+                                          d_ekb,
+                                          qq_nt,
+                                          this->deeq_nc,
                                           becp,
                                           dbecp,
                                           force);
+    }
+}
+
+template <typename FPTYPE, typename Device>
+void FS_Nonlocal_tools<FPTYPE, Device>::cal_force_dftu(int ik,
+                                                       int npm,
+                                                       FPTYPE* force,
+                                                       const int* orbital_corr,
+                                                       const std::complex<FPTYPE>* vu)
+{
+    int* orbital_corr_tmp = const_cast<int*>(orbital_corr);
+    std::complex<FPTYPE>* vu_tmp = const_cast<std::complex<FPTYPE>*>(vu);
 #if defined(__CUDA) || defined(__ROCM)
-            if (this->device == "gpu") {
-                delmem_cd_op()(gpu_ctx, vu_tmp);
-                delmem_int_op()(gpu_ctx, orbital_corr_tmp);
-            }
+    if (this->device == "gpu")
+    {
+        resmem_int_op()(gpu_ctx, orbital_corr_tmp, this->ucell_->ntype);
+        syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, orbital_corr_tmp, dftu->orbital_corr, this->ucell_->ntype);
+        resmem_cd_op()(gpu_ctx, vu_tmp, dftu->get_size_eff_pot_pw());
+        syncmem_cd_h2d_op()(gpu_ctx, cpu_ctx, vu_tmp, dftu->get_eff_pot_pw(0), dftu->get_size_eff_pot_pw());
+    }
+#endif
+    const int force_nc = 3;
+    cal_force_nl_op<FPTYPE, Device>()(this->ctx,
+                                      npm,
+                                      this->nbands,
+                                      this->ntype,
+                                      force_nc,
+                                      this->nbands,
+                                      ik,
+                                      nkb,
+                                      atom_nh,
+                                      atom_na,
+                                      this->ucell_->tpiba,
+                                      d_wg,
+                                      vu_tmp,
+                                      orbital_corr_tmp,
+                                      becp,
+                                      dbecp,
+                                      force);
+#if defined(__CUDA) || defined(__ROCM)
+    if (this->device == "gpu")
+    {
+        delmem_cd_op()(gpu_ctx, vu_tmp);
+        delmem_int_op()(gpu_ctx, orbital_corr_tmp);
+    }
 #endif
 }
 
 template <typename FPTYPE, typename Device>
-void FS_Nonlocal_tools<FPTYPE, Device>::cal_force_dspin(int ik, int npm, FPTYPE* force, const ModuleBase::Vector3<double>* lambda)
+void FS_Nonlocal_tools<FPTYPE, Device>::cal_force_dspin(int ik,
+                                                        int npm,
+                                                        FPTYPE* force,
+                                                        const ModuleBase::Vector3<double>* lambda)
 {
-            FPTYPE* lambda_array = nullptr;
-            resmem_var_op()(this->cpu_ctx, lambda_array, this->ucell_->nat*3);
-            for(int iat = 0; iat < this->ucell_->nat; iat++)
-            {
-                lambda_array[iat*3] = lambda[iat].x;
-                lambda_array[iat*3+1] = lambda[iat].y;
-                lambda_array[iat*3+2] = lambda[iat].z;
-            }
-            FPTYPE* lambda_tmp = lambda_array;
+    FPTYPE* lambda_array = nullptr;
+    resmem_var_op()(this->cpu_ctx, lambda_array, this->ucell_->nat * 3);
+    for (int iat = 0; iat < this->ucell_->nat; iat++)
+    {
+        lambda_array[iat * 3] = lambda[iat].x;
+        lambda_array[iat * 3 + 1] = lambda[iat].y;
+        lambda_array[iat * 3 + 2] = lambda[iat].z;
+    }
+    FPTYPE* lambda_tmp = lambda_array;
 #if defined(__CUDA) || defined(__ROCM)
-            if (this->device == "gpu") {
-                resmem_var_op()(this->ctx, lambda_tmp, this->ucell_->nat*3);
-                syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, lambda_tmp, lambda_array, this->ucell_->nat*3);
-            }
+    if (this->device == "gpu")
+    {
+        resmem_var_op()(this->ctx, lambda_tmp, this->ucell_->nat * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, lambda_tmp, lambda_array, this->ucell_->nat * 3);
+    }
 #endif
-            const int force_nc = 3;
-            cal_force_nl_op<FPTYPE, Device>()(this->ctx,
-                                          npm,
-                                          this->nbands,
-                                          this->ntype,
-                                          force_nc,
-                                          this->nbands,
-                                          ik,
-                                          nkb,
-                                          atom_nh,
-                                          atom_na,
-                                          this->ucell_->tpiba,
-                                          d_wg,
-                                          lambda_tmp,
-                                          becp,
-                                          dbecp,
-                                          force);
+    const int force_nc = 3;
+    cal_force_nl_op<FPTYPE, Device>()(this->ctx,
+                                      npm,
+                                      this->nbands,
+                                      this->ntype,
+                                      force_nc,
+                                      this->nbands,
+                                      ik,
+                                      nkb,
+                                      atom_nh,
+                                      atom_na,
+                                      this->ucell_->tpiba,
+                                      d_wg,
+                                      lambda_tmp,
+                                      becp,
+                                      dbecp,
+                                      force);
+
+    delmem_var_op()(this->ctx, lambda_array);
+#if defined(__CUDA) || defined(__ROCM)
+    if (this->device == "gpu")
+    {
+        delmem_var_op()(this->cpu_ctx, lambda_tmp);
+    }
+#endif
+}
 
-            delmem_var_op()(this->ctx, lambda_array);
+template <typename FPTYPE, typename Device>
+void FS_Nonlocal_tools<FPTYPE, Device>::cal_stress_dftu(int ik,
+                                                        int npm,
+                                                        FPTYPE* stress,
+                                                        const int* orbital_corr,
+                                                        const std::complex<FPTYPE>* vu)
+{
+    int* orbital_corr_tmp = const_cast<int*>(orbital_corr);
+    std::complex<FPTYPE>* vu_tmp = const_cast<std::complex<FPTYPE>*>(vu);
 #if defined(__CUDA) || defined(__ROCM)
-            if (this->device == "gpu") {
-                delmem_var_op()(this->cpu_ctx, lambda_tmp);
-            }
+    if (this->device == "gpu")
+    {
+        resmem_int_op()(gpu_ctx, orbital_corr_tmp, this->ucell_->ntype);
+        syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, orbital_corr_tmp, dftu->orbital_corr, this->ucell_->ntype);
+        resmem_cd_op()(gpu_ctx, vu_tmp, dftu->get_size_eff_pot_pw());
+        syncmem_cd_h2d_op()(gpu_ctx, cpu_ctx, vu_tmp, dftu->get_eff_pot_pw(0), dftu->get_size_eff_pot_pw());
+    }
+#endif
+    cal_stress_nl_op()(this->ctx,
+                       nkb,
+                       npm,
+                       this->ntype,
+                       this->nbands,
+                       ik,
+                       atom_nh,
+                       atom_na,
+                       d_wg,
+                       vu_tmp,
+                       orbital_corr_tmp,
+                       becp,
+                       dbecp,
+                       stress);
+#if defined(__CUDA) || defined(__ROCM)
+    if (this->device == "gpu")
+    {
+        delmem_cd_op()(gpu_ctx, vu_tmp);
+        delmem_int_op()(gpu_ctx, orbital_corr_tmp);
+    }
+#endif
+}
+
+template <typename FPTYPE, typename Device>
+void FS_Nonlocal_tools<FPTYPE, Device>::cal_stress_dspin(int ik,
+                                                         int npm,
+                                                         FPTYPE* stress,
+                                                         const ModuleBase::Vector3<double>* lambda)
+{
+    FPTYPE* lambda_array = nullptr;
+    resmem_var_op()(this->cpu_ctx, lambda_array, this->ucell_->nat * 3);
+    for (int iat = 0; iat < this->ucell_->nat; iat++)
+    {
+        lambda_array[iat * 3] = lambda[iat].x;
+        lambda_array[iat * 3 + 1] = lambda[iat].y;
+        lambda_array[iat * 3 + 2] = lambda[iat].z;
+    }
+    FPTYPE* lambda_tmp = lambda_array;
+#if defined(__CUDA) || defined(__ROCM)
+    if (this->device == "gpu")
+    {
+        resmem_var_op()(this->ctx, lambda_tmp, this->ucell_->nat * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, lambda_tmp, lambda_array, this->ucell_->nat * 3);
+    }
+#endif
+    const int force_nc = 3;
+    cal_stress_nl_op()(this->ctx,
+                       nkb,
+                       npm,
+                       this->ntype,
+                       this->nbands,
+                       ik,
+                       atom_nh,
+                       atom_na,
+                       d_wg,
+                       lambda_tmp,
+                       becp,
+                       dbecp,
+                       stress);
+
+    delmem_var_op()(this->ctx, lambda_array);
+#if defined(__CUDA) || defined(__ROCM)
+    if (this->device == "gpu")
+    {
+        delmem_var_op()(this->cpu_ctx, lambda_tmp);
+    }
 #endif
 }
 
@@ -1006,7 +1104,4 @@ template class FS_Nonlocal_tools<double, base_device::DEVICE_CPU>;
 template class FS_Nonlocal_tools<double, base_device::DEVICE_GPU>;
 #endif
 
-
-
-
 } // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.h b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.h
index f836560ddd..658d8e2281 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.h
@@ -63,7 +63,7 @@ class FS_Nonlocal_tools
      * @brief calculate the dbecp_{ij} = <psi|\partial beta/\partial varepsilon_{ij}> for all beta functions
      *       stress_{ij} = -1/omega \sum_{n,k}f_{nk} \sum_I \sum_{lm,l'm'}D_{l,l'}^{I} becp * dbecp_{ij} also calculated
      */
-    void cal_dbecp_s(int ik, int npm, int ipol, int jpol, FPTYPE* stress);
+    void cal_dbecp_s(int ik, int npm, int ipol, int jpol, FPTYPE* stress = nullptr);
     /**
      * @brief calculate the dbecp_i = <psi|\partial beta/\partial \tau^I_i> for all beta functions
      */
@@ -75,7 +75,10 @@ class FS_Nonlocal_tools
 
     void cal_force_dftu(int ik, int npm, FPTYPE* force, const int* orbital_corr, const std::complex<FPTYPE>* vu);
     void cal_force_dspin(int ik, int npm, FPTYPE* force, const ModuleBase::Vector3<double>* lambda);
-    
+    void cal_stress_dftu(int ik, int npm, FPTYPE* stress, const int* orbital_corr, const std::complex<FPTYPE>* vu);
+    void cal_stress_dspin(int ik, int npm, FPTYPE* stress, const ModuleBase::Vector3<double>* lambda);
+
+
     std::complex<FPTYPE>* get_becp() { return becp; }
     std::complex<FPTYPE>* get_dbecp() { return dbecp; }
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
index 70e3311339..aeae2acbf9 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
@@ -211,8 +211,6 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
     };
     // kernel for DFT+U
     void operator()(const base_device::DEVICE_CPU* ctx,
-                    const int& ipol,
-                    const int& jpol,
                     const int& nkb,
                     const int& nbands_occ,
                     const int& ntype,
@@ -242,12 +240,12 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
             const int ip_end = (orbital_l + 1) * (orbital_l + 1);
             const int tlp1 = 2 * orbital_l + 1;
             const int tlp1_2 = tlp1 * tlp1;
-            for (int ib = 0; ib < nbands_occ; ib++)
+            for (int ia = 0; ia < atom_na[it]; ia++)
             {
-                const int ib2 = ib*2;
-                FPTYPE fac = d_wg[ik * wg_nc + ib];
-                for (int ia = 0; ia < atom_na[it]; ia++)
+                for (int ib = 0; ib < nbands_occ; ib++)
                 {
+                    const int ib2 = ib*2;
+                    FPTYPE fac = d_wg[ik * wg_nc + ib];
                     for (int ip1 = ip_begin; ip1 < ip_end; ip1++)
                     {
                         const int m1 = ip1 - ip_begin;
@@ -270,18 +268,16 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
                             local_stress -= fac * (ps[0] * dbb0 + ps[1] * dbb1 + ps[2] * dbb2 + ps[3] * dbb3).real();
                         }
                     } // end ip
-                    vu += 4 * tlp1_2;// step for vu
-                }     // ia
-            }
+                }// ib
+                vu += 4 * tlp1_2;// step for vu
+            }// ia
             sum += atom_na[it] * nproj;
             iat += atom_na[it];
         } // end it
-        stress[ipol * 3 + jpol] += local_stress;
+        *stress += local_stress;
     };
     // kernel for DeltaSpin 
     void operator()(const base_device::DEVICE_CPU* ctx,
-                    const int& ipol,
-                    const int& jpol,
                     const int& nkb,
                     const int& nbands_occ,
                     const int& ntype,
@@ -300,17 +296,17 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
         for (int it = 0; it < ntype; it++)
         {
             const int nproj = atom_nh[it];
-            for (int ib = 0; ib < nbands_occ; ib++)
+            for (int ia = 0; ia < atom_na[it]; ia++)
             {
-                const int ib2 = ib*2;
-                FPTYPE fac = d_wg[ik * wg_nc + ib];
-                for (int ia = 0; ia < atom_na[it]; ia++)
+                int iat = iat0 + ia;
+                const std::complex<FPTYPE> coefficients0(lambda[iat*3+2], 0.0);
+                const std::complex<FPTYPE> coefficients1(lambda[iat*3] , lambda[iat*3+1]);
+                const std::complex<FPTYPE> coefficients2(lambda[iat*3] , -1 * lambda[iat*3+1]);
+                const std::complex<FPTYPE> coefficients3(-1 * lambda[iat*3+2], 0.0);
+                for (int ib = 0; ib < nbands_occ; ib++)
                 {
-                    int iat = iat0 + ia;
-                    const std::complex<FPTYPE> coefficients0(lambda[iat*3+2], 0.0);
-                    const std::complex<FPTYPE> coefficients1(lambda[iat*3] , lambda[iat*3+1]);
-                    const std::complex<FPTYPE> coefficients2(lambda[iat*3] , -1 * lambda[iat*3+1]);
-                    const std::complex<FPTYPE> coefficients3(-1 * lambda[iat*3+2], 0.0);
+                    const int ib2 = ib*2;
+                    FPTYPE fac = d_wg[ik * wg_nc + ib];
                     for (int ip = 0; ip < nproj; ip++)
                     {
                         const int inkb1 = ib2 * nkb + sum + ia * nproj + ip;
@@ -321,12 +317,12 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
                         const std::complex<FPTYPE> dbb3 = conj(dbecp[nkb + inkb1]) * becp[nkb + inkb1];
                         local_stress -= fac * (coefficients0 * dbb0 + coefficients1 * dbb1 + coefficients2 * dbb2 + coefficients3 * dbb3).real();
                     } // end ip
-                }     // ia
-            }
+                }// ib
+            }// ia
             sum += atom_na[it] * nproj;
             iat0 += atom_na[it];
         } // end it
-        stress[ipol * 3 + jpol] += local_stress;
+        *stress += local_stress;
     }
 };
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
index 77c42740e6..ad02359d1c 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
@@ -126,8 +126,6 @@ struct cal_stress_nl_op
                     FPTYPE* stress);
     // kernel for DFT+U
     void operator()(const base_device::DEVICE_CPU* ctx,
-                    const int& ipol,
-                    const int& jpol,
                     const int& nkb,
                     const int& nbands_occ,
                     const int& ntype,
@@ -143,8 +141,6 @@ struct cal_stress_nl_op
                     FPTYPE* stress);
     // kernel for DeltaSpin
     void operator()(const base_device::DEVICE_CPU* ctx,
-                    const int& ipol,
-                    const int& jpol,
                     const int& nkb,
                     const int& nbands_occ,
                     const int& ntype,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func.h b/source/module_hamilt_pw/hamilt_pwdft/stress_func.h
index f455215f63..80f4760df3 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_func.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func.h
@@ -56,8 +56,8 @@ template <typename FPTYPE, typename Device = base_device::DEVICE_CPU>
 class Stress_Func
 {
   public:
-    Stress_Func(){};
-    ~Stress_Func(){};
+    Stress_Func() {};
+    ~Stress_Func() {};
 
     // stress functions
     //  1) the stress from the electron kinetic energy
@@ -119,7 +119,7 @@ class Stress_Func
                      FPTYPE* drhocg,
                      ModulePW::PW_Basis* rho_basis,
                      int type); // used in nonlinear core correction stress
-                     
+
     // 6) the stress from the exchange-correlation functional term
     void stress_gga(ModuleBase::matrix& sigma,
                     ModulePW::PW_Basis* rho_basis,
@@ -134,7 +134,7 @@ class Stress_Func
 
     // 7) the stress from the non-local pseudopotentials
     /**
-     * @brief This routine computes the atomic force of non-local pseudopotential
+     * @brief This routine computes the stress contribution of non-local pseudopotential
      *    Stress^{NL}_{ij} = -1/\Omega \sum_{n,k}f_{nk}\sum_I \sum_{lm,l'm'}D_{l,l'}^{I} [
      *               \sum_G \langle c_{nk}(\mathbf{G+K})|\beta_{lm}^I(\mathbf{G+K})\rangle *
      *               \sum_{G'}\langle \partial \beta_{lm}^I(\mathbf{G+K})/\partial \varepsilon_{ij}
@@ -153,6 +153,24 @@ class Stress_Func
                    pseudopot_cell_vnl* nlpp_in,
                    const UnitCell& ucell_in); // nonlocal part in PW basis
 
+    // 7) the stress from the DFT+U and DeltaSpin calculations
+    /**
+     * @brief This routine computes the stress contribution from the DFT+U and DeltaSpin calculations
+     *    Stress^{NL}_{ij} = -1/\Omega \sum_{n,k}f_{nk}\sum_I \sum_{lm,l'm'}(V^U_{lmm'\sigma\sigma'} +
+     * f(\lambda,\sigma\sigma')) [ \sum_G \langle c_{nk}(\mathbf{G+K})|\alpha_{lm}^I(\mathbf{G+K})\rangle *
+     *               \sum_{G'}\langle \partial \alpha_{lm}^I(\mathbf{G+K})/\partial \varepsilon_{ij}
+     * |c_{nk}(\mathbf{G+K})\rangle ] there would be three parts in the above equation: (1) sum over becp and dbecp with
+     * f(U+\lambda, \sigma\sigma', lmm')^{I} ----- first line in the above equation (2) calculate becp = <psi | alpha>
+     * ----- second line in the above equation (3) calculate dbecp = <psi | dalpha> ----- third line in the above
+     * equation
+     */
+    void stress_onsite(ModuleBase::matrix& sigma,
+                       const ModuleBase::matrix& wg,
+                       const ModulePW::PW_Basis_K* wfc_basis,
+                       const UnitCell& ucell_in,
+                       const psi::Psi<complex<FPTYPE>, Device>* psi_in,
+                       ModuleSymmetry::Symmetry* p_symm); // nonlocal part in PW basis
+
     void get_dvnl1(ModuleBase::ComplexMatrix& vkb,
                    const int ik,
                    const int ipol,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_onsite.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_onsite.cpp
new file mode 100644
index 0000000000..ae66be1438
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_onsite.cpp
@@ -0,0 +1,113 @@
+#include "module_base/module_device/device.h"
+#include "module_base/timer.h"
+#include "module_hamilt_pw/hamilt_pwdft/onsite_projector.h"
+#include "module_parameter/parameter.h"
+#include "module_hamilt_lcao/module_dftu/dftu.h"
+#include "module_hamilt_lcao/module_deltaspin/spin_constrain.h"
+#include "stress_func.h"
+// calculate the nonlocal pseudopotential stress in PW
+template <typename FPTYPE, typename Device>
+void Stress_Func<FPTYPE, Device>::stress_onsite(ModuleBase::matrix& sigma,
+                                            const ModuleBase::matrix& wg,
+                                            const ModulePW::PW_Basis_K* wfc_basis,
+                                            const UnitCell& ucell_in,
+                                            const psi::Psi<complex<FPTYPE>, Device>* psi_in,
+                                            ModuleSymmetry::Symmetry* p_symm)
+{
+    ModuleBase::TITLE("Stress_Func", "stress_onsite");
+    if(psi_in == nullptr || wfc_basis == nullptr)
+    {
+        return;
+    }
+    ModuleBase::timer::tick("Stress_Func", "stress_onsite");
+
+    FPTYPE* stress_device = nullptr;
+    resmem_var_op()(this->ctx, stress_device, 9);
+    setmem_var_op()(this->ctx, stress_device, 0, 9);
+    std::vector<FPTYPE> sigma_onsite(9, 0.0);
+
+    auto* onsite_p = projectors::OnsiteProjector<FPTYPE, Device>::get_instance();
+
+    const int nks = wfc_basis->nks;
+    for (int ik = 0; ik < nks; ik++) // loop k points
+    {
+        // skip zero weights to speed up
+        int nbands_occ = wg.nc;
+        while (wg(ik, nbands_occ - 1) == 0.0)
+        {
+            nbands_occ--;
+            if (nbands_occ == 0)
+            {
+                break;
+            }
+        }
+        const int npm = nbands_occ;
+
+        // calculate becp = <psi|beta> for all beta functions
+        onsite_p->get_fs_tools()->cal_becp(ik, npm);
+        // calculate dbecp = <psi|d(beta)/dR> for all beta functions
+        // calculate stress = \sum <psi|d(beta_j)/dR> * <psi|beta_i> * D_{ij}
+        for (int ipol = 0; ipol < 3; ipol++)
+        {
+            for (int jpol = 0; jpol <= ipol; jpol++)
+            {
+                FPTYPE* stress_device_tmp = stress_device + (ipol * 3 + jpol);
+                onsite_p->get_fs_tools()->cal_dbecp_s(ik, npm, ipol, jpol, nullptr);
+                if(PARAM.inp.dft_plus_u)
+                {
+                    auto* dftu = ModuleDFTU::DFTU::get_instance();
+                    onsite_p->get_fs_tools()->cal_stress_dftu(ik, npm, stress_device_tmp, dftu->orbital_corr, dftu->get_eff_pot_pw(0));
+                }
+                if(PARAM.inp.sc_mag_switch)
+                {
+                    SpinConstrain<std::complex<double>, Device>& sc = SpinConstrain<std::complex<double>, Device>::getScInstance();
+                    const std::vector<ModuleBase::Vector3<double>>& lambda = sc.get_sc_lambda();
+                    onsite_p->get_fs_tools()->cal_stress_dspin(ik, npm, stress_device_tmp, lambda.data());
+                }
+            }
+        }
+    }
+    // transfer stress from device to host
+    syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, sigma_onsite.data(), stress_device, 9);
+    delmem_var_op()(this->ctx, stress_device);
+    // sum up forcenl from all processors
+    for (int l = 0; l < 3; l++)
+    {
+        for (int m = 0; m < 3; m++)
+        {
+            if (m > l)
+            {
+                sigma_onsite[l * 3 + m] = sigma_onsite[m * 3 + l];
+            }
+            Parallel_Reduce::reduce_all(sigma_onsite[l * 3 + m]); // qianrui fix a bug for kpar > 1
+        }
+    }
+    // rescale the stress with 1/omega
+    for (int ipol = 0; ipol < 3; ipol++)
+    {
+        for (int jpol = 0; jpol < 3; jpol++)
+        {
+            sigma_onsite[ipol * 3 + jpol] *= 1.0 / ucell_in.omega;
+        }
+    }
+
+    for (int ipol = 0; ipol < 3; ipol++)
+    {
+        for (int jpol = 0; jpol < 3; jpol++)
+        {
+            sigma(ipol, jpol) = sigma_onsite[ipol * 3 + jpol];
+        }
+    }
+    // do symmetry
+    if (ModuleSymmetry::Symmetry::symm_flag == 1)
+    {
+        p_symm->symmetrize_mat3(sigma, ucell_in.lat);
+    } // end symmetry
+
+    ModuleBase::timer::tick("Stress_Func", "stress_onsite");
+}
+
+template class Stress_Func<double, base_device::DEVICE_CPU>;
+#if ((defined __CUDA) || (defined __ROCM))
+template class Stress_Func<double, base_device::DEVICE_GPU>;
+#endif
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp
index a566dda0c5..b9910b27a8 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp
@@ -46,6 +46,9 @@ void Stress_PW<FPTYPE, Device>::cal_stress(ModuleBase::matrix& sigmatot,
     // vdw stress
     ModuleBase::matrix sigmavdw;
     sigmavdw.create(3, 3);
+    // DFT+U and DeltaSpin stress
+    ModuleBase::matrix sigmaonsite;
+    sigmaonsite.create(3, 3);
 
     for (int i = 0; i < 3; i++)
     {
@@ -60,6 +63,7 @@ void Stress_PW<FPTYPE, Device>::cal_stress(ModuleBase::matrix& sigmatot,
             sigmaewa(i, j) = 0.0;
             sigmaxcc(i, j) = 0.0;
             sigmavdw(i, j) = 0.0;
+            sigmaonsite(i, j) = 0.0;
         }
     }
 
@@ -107,13 +111,19 @@ void Stress_PW<FPTYPE, Device>::cal_stress(ModuleBase::matrix& sigmatot,
     // vdw term
     stress_vdw(sigmavdw, ucell);
 
+    // DFT+U and DeltaSpin stress
+    if(PARAM.inp.dft_plus_u || PARAM.inp.sc_mag_switch)
+    {
+        this->stress_onsite(sigmaonsite, this->pelec->wg, wfc_basis, ucell, psi_in, p_symm);
+    }
+
     for (int ipol = 0; ipol < 3; ipol++)
     {
         for (int jpol = 0; jpol < 3; jpol++)
         {
             sigmatot(ipol, jpol) = sigmakin(ipol, jpol) + sigmahar(ipol, jpol) + sigmanl(ipol, jpol)
                                    + sigmaxc(ipol, jpol) + sigmaxcc(ipol, jpol) + sigmaewa(ipol, jpol)
-                                   + sigmaloc(ipol, jpol) + sigmavdw(ipol, jpol);
+                                   + sigmaloc(ipol, jpol) + sigmavdw(ipol, jpol) + sigmaonsite(ipol, jpol);
         }
     }
 
@@ -138,6 +148,10 @@ void Stress_PW<FPTYPE, Device>::cal_stress(ModuleBase::matrix& sigmatot,
         ModuleIO::print_stress("XC    STRESS", sigmaxc, GlobalV::TEST_STRESS, ry);
         ModuleIO::print_stress("EWALD    STRESS", sigmaewa, GlobalV::TEST_STRESS, ry);
         ModuleIO::print_stress("NLCC    STRESS", sigmaxcc, GlobalV::TEST_STRESS, ry);
+        if(PARAM.inp.dft_plus_u || PARAM.inp.sc_mag_switch)
+        {
+            ModuleIO::print_stress("ONSITE    STRESS", sigmaonsite, GlobalV::TEST_STRESS, ry);
+        }
         ModuleIO::print_stress("TOTAL    STRESS", sigmatot, GlobalV::TEST_STRESS, ry);
     }
     ModuleBase::timer::tick("Stress_PW", "cal_stress");