From c324dfb421be696267081560073f2f1833cbe520 Mon Sep 17 00:00:00 2001 From: dyzheng Date: Mon, 2 Sep 2024 16:07:23 +0800 Subject: [PATCH] Feature: add stress for DFT+U and DeltaSpin in PW code --- .../hamilt_pwdft/CMakeLists.txt | 1 + .../hamilt_pwdft/forces_onsite.cpp | 5 - .../hamilt_pwdft/fs_nonlocal_tools.cpp | 457 +++++++++++------- .../hamilt_pwdft/fs_nonlocal_tools.h | 7 +- .../hamilt_pwdft/kernels/stress_op.cpp | 44 +- .../hamilt_pwdft/kernels/stress_op.h | 4 - .../hamilt_pwdft/stress_func.h | 26 +- .../hamilt_pwdft/stress_func_onsite.cpp | 113 +++++ .../hamilt_pwdft/stress_pw.cpp | 16 +- 9 files changed, 452 insertions(+), 221 deletions(-) create mode 100644 source/module_hamilt_pw/hamilt_pwdft/stress_func_onsite.cpp diff --git a/source/module_hamilt_pw/hamilt_pwdft/CMakeLists.txt b/source/module_hamilt_pw/hamilt_pwdft/CMakeLists.txt index b85287fe2f..206f434d82 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/CMakeLists.txt +++ b/source/module_hamilt_pw/hamilt_pwdft/CMakeLists.txt @@ -22,6 +22,7 @@ list(APPEND objects stress_func_kin.cpp stress_func_loc.cpp stress_func_nl.cpp + stress_func_onsite.cpp stress_func_us.cpp stress_pw.cpp VL_in_pw.cpp diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_onsite.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_onsite.cpp index 99668cd78c..2c2d710848 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces_onsite.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces_onsite.cpp @@ -7,11 +7,6 @@ #include "module_hamilt_lcao/module_dftu/dftu.h" #include "module_hamilt_lcao/module_deltaspin/spin_constrain.h" - -#ifdef _OPENMP -#include -#endif - template void Forces::cal_force_onsite(ModuleBase::matrix& force_onsite, const ModuleBase::matrix& wg, diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp index ae668ce811..b6e62fec68 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp @@ -7,6 +7,7 @@ #include "module_base/tool_title.h" #include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h" #include "nonlocal_maths.hpp" + #include namespace hamilt @@ -27,7 +28,7 @@ FS_Nonlocal_tools::FS_Nonlocal_tools(const pseudopot_cell_vnl* n // seems kvec_c never used... this->kvec_c = this->wfc_basis_->template get_kvec_c_data(); - // the following is important for calculating the whole contribution to + // the following is important for calculating the whole contribution to // Hamiltonian or force, stress: sum{nk} fnk*sum_{ij}Dij // among, Dij is deeq. // For DFT+U and other projection involved operators, deeq also plays. @@ -68,22 +69,24 @@ FS_Nonlocal_tools::FS_Nonlocal_tools(const pseudopot_cell_vnl* n } // allocate memory this->allocate_memory(wg, ekb, this->nproj, nch); - this->ppcell_vkb = (this->device == base_device::GpuDevice)? this->nlpp_->template get_vkb_data() : this->nlpp_->vkb.c; + this->ppcell_vkb + = (this->device == base_device::GpuDevice) ? this->nlpp_->template get_vkb_data() : this->nlpp_->vkb.c; } template -FS_Nonlocal_tools::FS_Nonlocal_tools(const std::vector& nproj, // number of projectors for each atom type - const std::vector& lproj, - const ModuleBase::realArray& tab, // radials' spherical bessel transform - const ModuleBase::matrix& nhtol, // (it, ich) -> l, the ich is (l, m)-distinctive index - std::complex* vkb_buf, // the vkb buffer - const UnitCell* ucell_in, - const psi::Psi, Device>* psi_in, - const K_Vectors* kv_in, - const ModulePW::PW_Basis_K* wfc_basis_in, - const Structure_Factor* sf_in, - const ModuleBase::matrix& wg, - const ModuleBase::matrix& ekb) +FS_Nonlocal_tools::FS_Nonlocal_tools( + const std::vector& nproj, // number of projectors for each atom type + const std::vector& lproj, + const ModuleBase::realArray& tab, // radials' spherical bessel transform + const ModuleBase::matrix& nhtol, // (it, ich) -> l, the ich is (l, m)-distinctive index + std::complex* vkb_buf, // the vkb buffer + const UnitCell* ucell_in, + const psi::Psi, Device>* psi_in, + const K_Vectors* kv_in, + const ModulePW::PW_Basis_K* wfc_basis_in, + const Structure_Factor* sf_in, + const ModuleBase::matrix& wg, + const ModuleBase::matrix& ekb) { // this is a constructor for general case, including vnl, dftu, deltaspin, deepks, etc. // what is needed for this kind of constructor? @@ -95,7 +98,6 @@ FS_Nonlocal_tools::FS_Nonlocal_tools(const std::vector& npr // rgrid: radial grid // deeq: the Dij matrix, Hubbard parameters or other quantities... - // what are already programmed to be needed? // tab: the spherical transform of radial functions, with q = linspace(0, GlobalV::NQX, GlobalV::DQ) @@ -105,15 +107,14 @@ FS_Nonlocal_tools::FS_Nonlocal_tools(const std::vector& npr // h_atom_nh: counterpart of atom_nh on host // max_nh: std::max_element(atom_nh.begin(), atom_nh.end()) - // in conclusion, this constructor needs the following individual information: - - // nproj + + // nproj // tab (projs is not needed, should be calculated elsewhere) - // lproj + // lproj // deeq, with its dims. it will be good to pass the whole realarray - // what can be built here + // what can be built here // nhtol // nkb // atom_nh, h_atom_nh, max_nh @@ -165,7 +166,7 @@ FS_Nonlocal_tools::~FS_Nonlocal_tools() } template -void FS_Nonlocal_tools::allocate_memory(const ModuleBase::matrix& wg, +void FS_Nonlocal_tools::allocate_memory(const ModuleBase::matrix& wg, const ModuleBase::matrix& ekb, const std::vector& nproj, const std::vector& nch) @@ -274,22 +275,25 @@ void FS_Nonlocal_tools::delete_memory() // . the multiplication with sk should be within specific operator // because the atom selection task is operator-specific. template -void FS_Nonlocal_tools::cal_becp(int ik, int npm, std::complex* becp_in, const std::complex* ppsi_in) +void FS_Nonlocal_tools::cal_becp(int ik, + int npm, + std::complex* becp_in, + const std::complex* ppsi_in) { ModuleBase::TITLE("FS_Nonlocal_tools", "cal_becp"); ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_becp"); - + const int npol = this->ucell_->get_npol(); - const std::complex* ppsi = ppsi_in==nullptr? &(this->psi_[0](ik, 0, 0)) : ppsi_in; + const std::complex* ppsi = ppsi_in == nullptr ? &(this->psi_[0](ik, 0, 0)) : ppsi_in; const int npw = this->wfc_basis_->npwk[ik]; - if(becp_in == nullptr && this->becp == nullptr) + if (becp_in == nullptr && this->becp == nullptr) { resmem_complex_op()(this->ctx, becp, this->nbands * npol * this->nkb); } - std::complex* becp_tmp = becp_in==nullptr? this->becp : becp_in; + std::complex* becp_tmp = becp_in == nullptr ? this->becp : becp_in; const int size_becp_act = npm * npol * this->nkb; - if(ik != this->current_ik)// different ik, need to recalculate vkb - { + if (ik != this->current_ik) // different ik, need to recalculate vkb + { const int size_becp = this->nbands * npol * this->nkb; if (this->becp == nullptr) { @@ -363,7 +367,7 @@ void FS_Nonlocal_tools::cal_becp(int ik, int npm, std::complex::cal_becp(int ik, int npm, std::complex> pref = maths.cal_pref(it, h_atom_nh[it]); const int nh = pref.size(); this->dvkb_indexes.resize(nh * 4); // print the value of nhtol // nhtol->print(std::cout); // as checked, nhtol works as expected - maths.cal_dvkb_index(nproj[it], - this->nhtol->c, - this->nhtol->nc, - npw, - it, - 0, - 0, - this->dvkb_indexes.data()); - + maths.cal_dvkb_index(nproj[it], this->nhtol->c, this->nhtol->nc, npw, it, 0, 0, this->dvkb_indexes.data()); + if (this->device == base_device::GpuDevice) { syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_dvkb_indexes, dvkb_indexes.data(), nh * 4); @@ -443,7 +440,6 @@ void FS_Nonlocal_tools::cal_becp(int ik, int npm, std::complexnkb); - if (this->device == base_device::GpuDevice) { std::complex* h_becp = nullptr; @@ -595,54 +591,57 @@ void FS_Nonlocal_tools::cal_dbecp_s(int ik, int npm, int ipol, i dbecp, nkb); // calculate stress for target (ipol, jpol) - if(npol == 1) + if (stress != nullptr) { - const int current_spin = this->kv_->isk[ik]; - cal_stress_nl_op()(this->ctx, - nondiagonal, - ipol, - jpol, - nkb, - npm, - this->ntype, - current_spin, // uspp only - this->nbands, - ik, - this->deeq_dims[1], - this->deeq_dims[2], - this->deeq_dims[3], - atom_nh, - atom_na, - d_wg, - d_ekb, - qq_nt, - deeq, - becp, - dbecp, - stress); - } - else - { - cal_stress_nl_op()(this->ctx, - ipol, - jpol, - nkb, - npm, - this->ntype, - this->nbands, - ik, - this->deeq_nc_dims[1], - this->deeq_nc_dims[2], - this->deeq_nc_dims[3], - atom_nh, - atom_na, - d_wg, - d_ekb, - qq_nt, - this->deeq_nc, - becp, - dbecp, - stress); + if (npol == 1) + { + const int current_spin = this->kv_->isk[ik]; + cal_stress_nl_op()(this->ctx, + nondiagonal, + ipol, + jpol, + nkb, + npm, + this->ntype, + current_spin, // uspp only + this->nbands, + ik, + this->deeq_dims[1], + this->deeq_dims[2], + this->deeq_dims[3], + atom_nh, + atom_na, + d_wg, + d_ekb, + qq_nt, + deeq, + becp, + dbecp, + stress); + } + else + { + cal_stress_nl_op()(this->ctx, + ipol, + jpol, + nkb, + npm, + this->ntype, + this->nbands, + ik, + this->deeq_nc_dims[1], + this->deeq_nc_dims[2], + this->deeq_nc_dims[3], + atom_nh, + atom_na, + d_wg, + d_ekb, + qq_nt, + this->deeq_nc, + becp, + dbecp, + stress); + } } ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_s"); } @@ -700,7 +699,7 @@ void FS_Nonlocal_tools::cal_dbecp_f(int ik, int npm, int ipol) const int npm_npol = npm * npol; const int size_becp = this->nbands * npol * this->nkb; if (this->dbecp == nullptr) // if it is the very first run, we allocate - { // why not judging whether dbecp == nullptr inside resmem_complex_op? + { // why not judging whether dbecp == nullptr inside resmem_complex_op? resmem_complex_op()(this->ctx, dbecp, 3 * size_becp); } // do gemm to get dbecp and revert the ppcell_vkb for next ipol @@ -806,7 +805,8 @@ template void FS_Nonlocal_tools::transfer_gcar(int npw, int npw_max, const FPTYPE* gcar_in) { std::vector gcar_tmp(3 * npw_max); // [out], will overwritten this->gcar - gcar_tmp.assign(gcar_in, gcar_in + 3 * npw_max); // UNDEFINED BEHAVIOR!!! nobody always knows the memory layout of vector3 + gcar_tmp.assign(gcar_in, + gcar_in + 3 * npw_max); // UNDEFINED BEHAVIOR!!! nobody always knows the memory layout of vector3 std::vector gcar_zero_indexes_tmp(3 * npw_max); // a "checklist" int* gcar_zero_ptrs[3]; @@ -865,41 +865,17 @@ void FS_Nonlocal_tools::cal_force(int ik, int npm, FPTYPE* force const int current_spin = this->kv_->isk[ik]; const int force_nc = 3; // calculate the force - if(this->ucell_->get_npol() == 1) - { - cal_force_nl_op()(this->ctx, - nondiagonal, - npm, - this->nbands, - this->ntype, - current_spin, - this->deeq_dims[1], - this->deeq_dims[2], - this->deeq_dims[3], - force_nc, - this->nbands, - ik, - nkb, - atom_nh, - atom_na, - this->ucell_->tpiba, - d_wg, - d_ekb, - qq_nt, - deeq, - becp, - dbecp, - force); - } - else + if (this->ucell_->get_npol() == 1) { cal_force_nl_op()(this->ctx, + nondiagonal, npm, this->nbands, this->ntype, - this->deeq_nc_dims[1], - this->deeq_nc_dims[2], - this->deeq_nc_dims[3], + current_spin, + this->deeq_dims[1], + this->deeq_dims[2], + this->deeq_dims[3], force_nc, this->nbands, ik, @@ -910,31 +886,20 @@ void FS_Nonlocal_tools::cal_force(int ik, int npm, FPTYPE* force d_wg, d_ekb, qq_nt, - this->deeq_nc, + deeq, becp, dbecp, force); } -} - -template -void FS_Nonlocal_tools::cal_force_dftu(int ik, int npm, FPTYPE* force, const int* orbital_corr, const std::complex* vu) -{ - int* orbital_corr_tmp = const_cast(orbital_corr); - std::complex* vu_tmp = const_cast*>(vu); -#if defined(__CUDA) || defined(__ROCM) - if (this->device == "gpu") { - resmem_int_op()(gpu_ctx, orbital_corr_tmp, this->ucell_->ntype); - syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, orbital_corr_tmp, dftu->orbital_corr, this->ucell_->ntype); - resmem_cd_op()(gpu_ctx, vu_tmp, dftu->get_size_eff_pot_pw()); - syncmem_cd_h2d_op()(gpu_ctx, cpu_ctx, vu_tmp, dftu->get_eff_pot_pw(0), dftu->get_size_eff_pot_pw()); - } -#endif - const int force_nc = 3; - cal_force_nl_op()(this->ctx, + else + { + cal_force_nl_op()(this->ctx, npm, this->nbands, this->ntype, + this->deeq_nc_dims[1], + this->deeq_nc_dims[2], + this->deeq_nc_dims[3], force_nc, this->nbands, ik, @@ -943,60 +908,193 @@ void FS_Nonlocal_tools::cal_force_dftu(int ik, int npm, FPTYPE* atom_na, this->ucell_->tpiba, d_wg, - vu_tmp, - orbital_corr_tmp, + d_ekb, + qq_nt, + this->deeq_nc, becp, dbecp, force); + } +} + +template +void FS_Nonlocal_tools::cal_force_dftu(int ik, + int npm, + FPTYPE* force, + const int* orbital_corr, + const std::complex* vu) +{ + int* orbital_corr_tmp = const_cast(orbital_corr); + std::complex* vu_tmp = const_cast*>(vu); #if defined(__CUDA) || defined(__ROCM) - if (this->device == "gpu") { - delmem_cd_op()(gpu_ctx, vu_tmp); - delmem_int_op()(gpu_ctx, orbital_corr_tmp); - } + if (this->device == "gpu") + { + resmem_int_op()(gpu_ctx, orbital_corr_tmp, this->ucell_->ntype); + syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, orbital_corr_tmp, dftu->orbital_corr, this->ucell_->ntype); + resmem_cd_op()(gpu_ctx, vu_tmp, dftu->get_size_eff_pot_pw()); + syncmem_cd_h2d_op()(gpu_ctx, cpu_ctx, vu_tmp, dftu->get_eff_pot_pw(0), dftu->get_size_eff_pot_pw()); + } +#endif + const int force_nc = 3; + cal_force_nl_op()(this->ctx, + npm, + this->nbands, + this->ntype, + force_nc, + this->nbands, + ik, + nkb, + atom_nh, + atom_na, + this->ucell_->tpiba, + d_wg, + vu_tmp, + orbital_corr_tmp, + becp, + dbecp, + force); +#if defined(__CUDA) || defined(__ROCM) + if (this->device == "gpu") + { + delmem_cd_op()(gpu_ctx, vu_tmp); + delmem_int_op()(gpu_ctx, orbital_corr_tmp); + } #endif } template -void FS_Nonlocal_tools::cal_force_dspin(int ik, int npm, FPTYPE* force, const ModuleBase::Vector3* lambda) +void FS_Nonlocal_tools::cal_force_dspin(int ik, + int npm, + FPTYPE* force, + const ModuleBase::Vector3* lambda) { - FPTYPE* lambda_array = nullptr; - resmem_var_op()(this->cpu_ctx, lambda_array, this->ucell_->nat*3); - for(int iat = 0; iat < this->ucell_->nat; iat++) - { - lambda_array[iat*3] = lambda[iat].x; - lambda_array[iat*3+1] = lambda[iat].y; - lambda_array[iat*3+2] = lambda[iat].z; - } - FPTYPE* lambda_tmp = lambda_array; + FPTYPE* lambda_array = nullptr; + resmem_var_op()(this->cpu_ctx, lambda_array, this->ucell_->nat * 3); + for (int iat = 0; iat < this->ucell_->nat; iat++) + { + lambda_array[iat * 3] = lambda[iat].x; + lambda_array[iat * 3 + 1] = lambda[iat].y; + lambda_array[iat * 3 + 2] = lambda[iat].z; + } + FPTYPE* lambda_tmp = lambda_array; #if defined(__CUDA) || defined(__ROCM) - if (this->device == "gpu") { - resmem_var_op()(this->ctx, lambda_tmp, this->ucell_->nat*3); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, lambda_tmp, lambda_array, this->ucell_->nat*3); - } + if (this->device == "gpu") + { + resmem_var_op()(this->ctx, lambda_tmp, this->ucell_->nat * 3); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, lambda_tmp, lambda_array, this->ucell_->nat * 3); + } #endif - const int force_nc = 3; - cal_force_nl_op()(this->ctx, - npm, - this->nbands, - this->ntype, - force_nc, - this->nbands, - ik, - nkb, - atom_nh, - atom_na, - this->ucell_->tpiba, - d_wg, - lambda_tmp, - becp, - dbecp, - force); + const int force_nc = 3; + cal_force_nl_op()(this->ctx, + npm, + this->nbands, + this->ntype, + force_nc, + this->nbands, + ik, + nkb, + atom_nh, + atom_na, + this->ucell_->tpiba, + d_wg, + lambda_tmp, + becp, + dbecp, + force); + + delmem_var_op()(this->ctx, lambda_array); +#if defined(__CUDA) || defined(__ROCM) + if (this->device == "gpu") + { + delmem_var_op()(this->cpu_ctx, lambda_tmp); + } +#endif +} - delmem_var_op()(this->ctx, lambda_array); +template +void FS_Nonlocal_tools::cal_stress_dftu(int ik, + int npm, + FPTYPE* stress, + const int* orbital_corr, + const std::complex* vu) +{ + int* orbital_corr_tmp = const_cast(orbital_corr); + std::complex* vu_tmp = const_cast*>(vu); #if defined(__CUDA) || defined(__ROCM) - if (this->device == "gpu") { - delmem_var_op()(this->cpu_ctx, lambda_tmp); - } + if (this->device == "gpu") + { + resmem_int_op()(gpu_ctx, orbital_corr_tmp, this->ucell_->ntype); + syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, orbital_corr_tmp, dftu->orbital_corr, this->ucell_->ntype); + resmem_cd_op()(gpu_ctx, vu_tmp, dftu->get_size_eff_pot_pw()); + syncmem_cd_h2d_op()(gpu_ctx, cpu_ctx, vu_tmp, dftu->get_eff_pot_pw(0), dftu->get_size_eff_pot_pw()); + } +#endif + cal_stress_nl_op()(this->ctx, + nkb, + npm, + this->ntype, + this->nbands, + ik, + atom_nh, + atom_na, + d_wg, + vu_tmp, + orbital_corr_tmp, + becp, + dbecp, + stress); +#if defined(__CUDA) || defined(__ROCM) + if (this->device == "gpu") + { + delmem_cd_op()(gpu_ctx, vu_tmp); + delmem_int_op()(gpu_ctx, orbital_corr_tmp); + } +#endif +} + +template +void FS_Nonlocal_tools::cal_stress_dspin(int ik, + int npm, + FPTYPE* stress, + const ModuleBase::Vector3* lambda) +{ + FPTYPE* lambda_array = nullptr; + resmem_var_op()(this->cpu_ctx, lambda_array, this->ucell_->nat * 3); + for (int iat = 0; iat < this->ucell_->nat; iat++) + { + lambda_array[iat * 3] = lambda[iat].x; + lambda_array[iat * 3 + 1] = lambda[iat].y; + lambda_array[iat * 3 + 2] = lambda[iat].z; + } + FPTYPE* lambda_tmp = lambda_array; +#if defined(__CUDA) || defined(__ROCM) + if (this->device == "gpu") + { + resmem_var_op()(this->ctx, lambda_tmp, this->ucell_->nat * 3); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, lambda_tmp, lambda_array, this->ucell_->nat * 3); + } +#endif + const int force_nc = 3; + cal_stress_nl_op()(this->ctx, + nkb, + npm, + this->ntype, + this->nbands, + ik, + atom_nh, + atom_na, + d_wg, + lambda_tmp, + becp, + dbecp, + stress); + + delmem_var_op()(this->ctx, lambda_array); +#if defined(__CUDA) || defined(__ROCM) + if (this->device == "gpu") + { + delmem_var_op()(this->cpu_ctx, lambda_tmp); + } #endif } @@ -1006,7 +1104,4 @@ template class FS_Nonlocal_tools; template class FS_Nonlocal_tools; #endif - - - } // namespace hamilt diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.h b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.h index f836560ddd..658d8e2281 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.h +++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.h @@ -63,7 +63,7 @@ class FS_Nonlocal_tools * @brief calculate the dbecp_{ij} = for all beta functions * stress_{ij} = -1/omega \sum_{n,k}f_{nk} \sum_I \sum_{lm,l'm'}D_{l,l'}^{I} becp * dbecp_{ij} also calculated */ - void cal_dbecp_s(int ik, int npm, int ipol, int jpol, FPTYPE* stress); + void cal_dbecp_s(int ik, int npm, int ipol, int jpol, FPTYPE* stress = nullptr); /** * @brief calculate the dbecp_i = for all beta functions */ @@ -75,7 +75,10 @@ class FS_Nonlocal_tools void cal_force_dftu(int ik, int npm, FPTYPE* force, const int* orbital_corr, const std::complex* vu); void cal_force_dspin(int ik, int npm, FPTYPE* force, const ModuleBase::Vector3* lambda); - + void cal_stress_dftu(int ik, int npm, FPTYPE* stress, const int* orbital_corr, const std::complex* vu); + void cal_stress_dspin(int ik, int npm, FPTYPE* stress, const ModuleBase::Vector3* lambda); + + std::complex* get_becp() { return becp; } std::complex* get_dbecp() { return dbecp; } diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp index 70e3311339..aeae2acbf9 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp @@ -211,8 +211,6 @@ struct cal_stress_nl_op }; // kernel for DFT+U void operator()(const base_device::DEVICE_CPU* ctx, - const int& ipol, - const int& jpol, const int& nkb, const int& nbands_occ, const int& ntype, @@ -242,12 +240,12 @@ struct cal_stress_nl_op const int ip_end = (orbital_l + 1) * (orbital_l + 1); const int tlp1 = 2 * orbital_l + 1; const int tlp1_2 = tlp1 * tlp1; - for (int ib = 0; ib < nbands_occ; ib++) + for (int ia = 0; ia < atom_na[it]; ia++) { - const int ib2 = ib*2; - FPTYPE fac = d_wg[ik * wg_nc + ib]; - for (int ia = 0; ia < atom_na[it]; ia++) + for (int ib = 0; ib < nbands_occ; ib++) { + const int ib2 = ib*2; + FPTYPE fac = d_wg[ik * wg_nc + ib]; for (int ip1 = ip_begin; ip1 < ip_end; ip1++) { const int m1 = ip1 - ip_begin; @@ -270,18 +268,16 @@ struct cal_stress_nl_op local_stress -= fac * (ps[0] * dbb0 + ps[1] * dbb1 + ps[2] * dbb2 + ps[3] * dbb3).real(); } } // end ip - vu += 4 * tlp1_2;// step for vu - } // ia - } + }// ib + vu += 4 * tlp1_2;// step for vu + }// ia sum += atom_na[it] * nproj; iat += atom_na[it]; } // end it - stress[ipol * 3 + jpol] += local_stress; + *stress += local_stress; }; // kernel for DeltaSpin void operator()(const base_device::DEVICE_CPU* ctx, - const int& ipol, - const int& jpol, const int& nkb, const int& nbands_occ, const int& ntype, @@ -300,17 +296,17 @@ struct cal_stress_nl_op for (int it = 0; it < ntype; it++) { const int nproj = atom_nh[it]; - for (int ib = 0; ib < nbands_occ; ib++) + for (int ia = 0; ia < atom_na[it]; ia++) { - const int ib2 = ib*2; - FPTYPE fac = d_wg[ik * wg_nc + ib]; - for (int ia = 0; ia < atom_na[it]; ia++) + int iat = iat0 + ia; + const std::complex coefficients0(lambda[iat*3+2], 0.0); + const std::complex coefficients1(lambda[iat*3] , lambda[iat*3+1]); + const std::complex coefficients2(lambda[iat*3] , -1 * lambda[iat*3+1]); + const std::complex coefficients3(-1 * lambda[iat*3+2], 0.0); + for (int ib = 0; ib < nbands_occ; ib++) { - int iat = iat0 + ia; - const std::complex coefficients0(lambda[iat*3+2], 0.0); - const std::complex coefficients1(lambda[iat*3] , lambda[iat*3+1]); - const std::complex coefficients2(lambda[iat*3] , -1 * lambda[iat*3+1]); - const std::complex coefficients3(-1 * lambda[iat*3+2], 0.0); + const int ib2 = ib*2; + FPTYPE fac = d_wg[ik * wg_nc + ib]; for (int ip = 0; ip < nproj; ip++) { const int inkb1 = ib2 * nkb + sum + ia * nproj + ip; @@ -321,12 +317,12 @@ struct cal_stress_nl_op const std::complex dbb3 = conj(dbecp[nkb + inkb1]) * becp[nkb + inkb1]; local_stress -= fac * (coefficients0 * dbb0 + coefficients1 * dbb1 + coefficients2 * dbb2 + coefficients3 * dbb3).real(); } // end ip - } // ia - } + }// ib + }// ia sum += atom_na[it] * nproj; iat0 += atom_na[it]; } // end it - stress[ipol * 3 + jpol] += local_stress; + *stress += local_stress; } }; diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h index 77c42740e6..ad02359d1c 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h @@ -126,8 +126,6 @@ struct cal_stress_nl_op FPTYPE* stress); // kernel for DFT+U void operator()(const base_device::DEVICE_CPU* ctx, - const int& ipol, - const int& jpol, const int& nkb, const int& nbands_occ, const int& ntype, @@ -143,8 +141,6 @@ struct cal_stress_nl_op FPTYPE* stress); // kernel for DeltaSpin void operator()(const base_device::DEVICE_CPU* ctx, - const int& ipol, - const int& jpol, const int& nkb, const int& nbands_occ, const int& ntype, diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func.h b/source/module_hamilt_pw/hamilt_pwdft/stress_func.h index f455215f63..80f4760df3 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func.h +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func.h @@ -56,8 +56,8 @@ template class Stress_Func { public: - Stress_Func(){}; - ~Stress_Func(){}; + Stress_Func() {}; + ~Stress_Func() {}; // stress functions // 1) the stress from the electron kinetic energy @@ -119,7 +119,7 @@ class Stress_Func FPTYPE* drhocg, ModulePW::PW_Basis* rho_basis, int type); // used in nonlinear core correction stress - + // 6) the stress from the exchange-correlation functional term void stress_gga(ModuleBase::matrix& sigma, ModulePW::PW_Basis* rho_basis, @@ -134,7 +134,7 @@ class Stress_Func // 7) the stress from the non-local pseudopotentials /** - * @brief This routine computes the atomic force of non-local pseudopotential + * @brief This routine computes the stress contribution of non-local pseudopotential * Stress^{NL}_{ij} = -1/\Omega \sum_{n,k}f_{nk}\sum_I \sum_{lm,l'm'}D_{l,l'}^{I} [ * \sum_G \langle c_{nk}(\mathbf{G+K})|\beta_{lm}^I(\mathbf{G+K})\rangle * * \sum_{G'}\langle \partial \beta_{lm}^I(\mathbf{G+K})/\partial \varepsilon_{ij} @@ -153,6 +153,24 @@ class Stress_Func pseudopot_cell_vnl* nlpp_in, const UnitCell& ucell_in); // nonlocal part in PW basis + // 7) the stress from the DFT+U and DeltaSpin calculations + /** + * @brief This routine computes the stress contribution from the DFT+U and DeltaSpin calculations + * Stress^{NL}_{ij} = -1/\Omega \sum_{n,k}f_{nk}\sum_I \sum_{lm,l'm'}(V^U_{lmm'\sigma\sigma'} + + * f(\lambda,\sigma\sigma')) [ \sum_G \langle c_{nk}(\mathbf{G+K})|\alpha_{lm}^I(\mathbf{G+K})\rangle * + * \sum_{G'}\langle \partial \alpha_{lm}^I(\mathbf{G+K})/\partial \varepsilon_{ij} + * |c_{nk}(\mathbf{G+K})\rangle ] there would be three parts in the above equation: (1) sum over becp and dbecp with + * f(U+\lambda, \sigma\sigma', lmm')^{I} ----- first line in the above equation (2) calculate becp = + * ----- second line in the above equation (3) calculate dbecp = ----- third line in the above + * equation + */ + void stress_onsite(ModuleBase::matrix& sigma, + const ModuleBase::matrix& wg, + const ModulePW::PW_Basis_K* wfc_basis, + const UnitCell& ucell_in, + const psi::Psi, Device>* psi_in, + ModuleSymmetry::Symmetry* p_symm); // nonlocal part in PW basis + void get_dvnl1(ModuleBase::ComplexMatrix& vkb, const int ik, const int ipol, diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_onsite.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_onsite.cpp new file mode 100644 index 0000000000..ae66be1438 --- /dev/null +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_onsite.cpp @@ -0,0 +1,113 @@ +#include "module_base/module_device/device.h" +#include "module_base/timer.h" +#include "module_hamilt_pw/hamilt_pwdft/onsite_projector.h" +#include "module_parameter/parameter.h" +#include "module_hamilt_lcao/module_dftu/dftu.h" +#include "module_hamilt_lcao/module_deltaspin/spin_constrain.h" +#include "stress_func.h" +// calculate the nonlocal pseudopotential stress in PW +template +void Stress_Func::stress_onsite(ModuleBase::matrix& sigma, + const ModuleBase::matrix& wg, + const ModulePW::PW_Basis_K* wfc_basis, + const UnitCell& ucell_in, + const psi::Psi, Device>* psi_in, + ModuleSymmetry::Symmetry* p_symm) +{ + ModuleBase::TITLE("Stress_Func", "stress_onsite"); + if(psi_in == nullptr || wfc_basis == nullptr) + { + return; + } + ModuleBase::timer::tick("Stress_Func", "stress_onsite"); + + FPTYPE* stress_device = nullptr; + resmem_var_op()(this->ctx, stress_device, 9); + setmem_var_op()(this->ctx, stress_device, 0, 9); + std::vector sigma_onsite(9, 0.0); + + auto* onsite_p = projectors::OnsiteProjector::get_instance(); + + const int nks = wfc_basis->nks; + for (int ik = 0; ik < nks; ik++) // loop k points + { + // skip zero weights to speed up + int nbands_occ = wg.nc; + while (wg(ik, nbands_occ - 1) == 0.0) + { + nbands_occ--; + if (nbands_occ == 0) + { + break; + } + } + const int npm = nbands_occ; + + // calculate becp = for all beta functions + onsite_p->get_fs_tools()->cal_becp(ik, npm); + // calculate dbecp = for all beta functions + // calculate stress = \sum * * D_{ij} + for (int ipol = 0; ipol < 3; ipol++) + { + for (int jpol = 0; jpol <= ipol; jpol++) + { + FPTYPE* stress_device_tmp = stress_device + (ipol * 3 + jpol); + onsite_p->get_fs_tools()->cal_dbecp_s(ik, npm, ipol, jpol, nullptr); + if(PARAM.inp.dft_plus_u) + { + auto* dftu = ModuleDFTU::DFTU::get_instance(); + onsite_p->get_fs_tools()->cal_stress_dftu(ik, npm, stress_device_tmp, dftu->orbital_corr, dftu->get_eff_pot_pw(0)); + } + if(PARAM.inp.sc_mag_switch) + { + SpinConstrain, Device>& sc = SpinConstrain, Device>::getScInstance(); + const std::vector>& lambda = sc.get_sc_lambda(); + onsite_p->get_fs_tools()->cal_stress_dspin(ik, npm, stress_device_tmp, lambda.data()); + } + } + } + } + // transfer stress from device to host + syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, sigma_onsite.data(), stress_device, 9); + delmem_var_op()(this->ctx, stress_device); + // sum up forcenl from all processors + for (int l = 0; l < 3; l++) + { + for (int m = 0; m < 3; m++) + { + if (m > l) + { + sigma_onsite[l * 3 + m] = sigma_onsite[m * 3 + l]; + } + Parallel_Reduce::reduce_all(sigma_onsite[l * 3 + m]); // qianrui fix a bug for kpar > 1 + } + } + // rescale the stress with 1/omega + for (int ipol = 0; ipol < 3; ipol++) + { + for (int jpol = 0; jpol < 3; jpol++) + { + sigma_onsite[ipol * 3 + jpol] *= 1.0 / ucell_in.omega; + } + } + + for (int ipol = 0; ipol < 3; ipol++) + { + for (int jpol = 0; jpol < 3; jpol++) + { + sigma(ipol, jpol) = sigma_onsite[ipol * 3 + jpol]; + } + } + // do symmetry + if (ModuleSymmetry::Symmetry::symm_flag == 1) + { + p_symm->symmetrize_mat3(sigma, ucell_in.lat); + } // end symmetry + + ModuleBase::timer::tick("Stress_Func", "stress_onsite"); +} + +template class Stress_Func; +#if ((defined __CUDA) || (defined __ROCM)) +template class Stress_Func; +#endif \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp index a566dda0c5..b9910b27a8 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp @@ -46,6 +46,9 @@ void Stress_PW::cal_stress(ModuleBase::matrix& sigmatot, // vdw stress ModuleBase::matrix sigmavdw; sigmavdw.create(3, 3); + // DFT+U and DeltaSpin stress + ModuleBase::matrix sigmaonsite; + sigmaonsite.create(3, 3); for (int i = 0; i < 3; i++) { @@ -60,6 +63,7 @@ void Stress_PW::cal_stress(ModuleBase::matrix& sigmatot, sigmaewa(i, j) = 0.0; sigmaxcc(i, j) = 0.0; sigmavdw(i, j) = 0.0; + sigmaonsite(i, j) = 0.0; } } @@ -107,13 +111,19 @@ void Stress_PW::cal_stress(ModuleBase::matrix& sigmatot, // vdw term stress_vdw(sigmavdw, ucell); + // DFT+U and DeltaSpin stress + if(PARAM.inp.dft_plus_u || PARAM.inp.sc_mag_switch) + { + this->stress_onsite(sigmaonsite, this->pelec->wg, wfc_basis, ucell, psi_in, p_symm); + } + for (int ipol = 0; ipol < 3; ipol++) { for (int jpol = 0; jpol < 3; jpol++) { sigmatot(ipol, jpol) = sigmakin(ipol, jpol) + sigmahar(ipol, jpol) + sigmanl(ipol, jpol) + sigmaxc(ipol, jpol) + sigmaxcc(ipol, jpol) + sigmaewa(ipol, jpol) - + sigmaloc(ipol, jpol) + sigmavdw(ipol, jpol); + + sigmaloc(ipol, jpol) + sigmavdw(ipol, jpol) + sigmaonsite(ipol, jpol); } } @@ -138,6 +148,10 @@ void Stress_PW::cal_stress(ModuleBase::matrix& sigmatot, ModuleIO::print_stress("XC STRESS", sigmaxc, GlobalV::TEST_STRESS, ry); ModuleIO::print_stress("EWALD STRESS", sigmaewa, GlobalV::TEST_STRESS, ry); ModuleIO::print_stress("NLCC STRESS", sigmaxcc, GlobalV::TEST_STRESS, ry); + if(PARAM.inp.dft_plus_u || PARAM.inp.sc_mag_switch) + { + ModuleIO::print_stress("ONSITE STRESS", sigmaonsite, GlobalV::TEST_STRESS, ry); + } ModuleIO::print_stress("TOTAL STRESS", sigmatot, GlobalV::TEST_STRESS, ry); } ModuleBase::timer::tick("Stress_PW", "cal_stress");