1.1 Release (#228)

* Remove unreachable code
* Bump version
* Missing functions
* updated readme

---------

Co-authored-by: Qiming Sun <[email protected]>
Co-authored-by: xiaojie.wu <[email protected]>
pyscf · Oct 30, 2024 · 2cdca60 · 2cdca60
1 parent e83eab4
commit 2cdca60
Show file tree

Hide file tree

Showing 9 changed files with 31 additions and 16 deletions.
diff --git a/README.md b/README.md
@@ -51,14 +51,15 @@ Features
 - Unrestricted Hartree-Fock and Unrestricted DFT, gradient, and Hessian;
 - MP2/DF-MP2 and CCSD (experimental);
 - Polarizability, IR, and NMR shielding
+- QM/MM with PBC
+- CHELPG, ESP, and RESP atomic charges
 
 Limitations
 --------
 - Rys roots up to 9 for density fitting scheme and direct scf scheme;
 - Atomic basis up to g orbitals;
 - Auxiliary basis up to i orbitals;
 - Density fitting scheme up to ~168 atoms with def2-tzvpd basis, bounded by CPU memory;
-- Hessian is unavailable for Direct SCF yet;
 - meta-GGA without density laplacian;
 - Double hybrid functionals are not supported;
 

diff --git a/gpu4pyscf/__init__.py b/gpu4pyscf/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '1.0.2'
+__version__ = '1.1.0'
 
 # monkey patch libxc reference due to a bug in nvcc
 from pyscf.dft import libxc

diff --git a/gpu4pyscf/lib/gvhf-rys/unrolled_ejk_ip1.cu b/gpu4pyscf/lib/gvhf-rys/unrolled_ejk_ip1.cu
@@ -1,6 +1,8 @@
 #include "vhf.cuh"
 #include "rys_roots_unrolled.cu"
 #include "create_tasks_ip1.cu"
+int rys_ejk_ip1_unrolled_lmax = 2;
+int rys_ejk_ip1_unrolled_max_order = 4;
 
 
 __device__ static

diff --git a/gpu4pyscf/lib/gvhf-rys/unrolled_os.cu b/gpu4pyscf/lib/gvhf-rys/unrolled_os.cu
@@ -2,6 +2,8 @@
 #include "vhf.cuh"
 #include "gamma_inc_unrolled.cu"
 #include "create_tasks.cu"
+int os_jk_unrolled_lmax = 1;
+int os_jk_unrolled_max_order = 0;
 
 
 __device__ static

diff --git a/gpu4pyscf/lib/gvhf-rys/unrolled_rys.cu b/gpu4pyscf/lib/gvhf-rys/unrolled_rys.cu
@@ -2,6 +2,9 @@
 #include "vhf.cuh"
 #include "rys_roots_unrolled.cu"
 #include "create_tasks.cu"
+int rys_jk_unrolled_lmax = 3;
+int rys_jk_unrolled_max_order = 4;
+int rys_jk_unrolled_max_nf = 60;
 
 
 __device__ static

diff --git a/gpu4pyscf/lib/gvhf-rys/unrolled_rys_ip1.cu b/gpu4pyscf/lib/gvhf-rys/unrolled_rys_ip1.cu
@@ -2,6 +2,9 @@
 #include "vhf.cuh"
 #include "rys_roots_unrolled.cu"
 #include "create_tasks_ip1.cu"
+int rys_vjk_ip1_unrolled_lmax = 2;
+int rys_vjk_ip1_unrolled_max_order = 4;
+int rys_vjk_ip1_unrolled_max_nf = 60;
 
 
 __device__ static

diff --git a/gpu4pyscf/lib/gvhf-rys/unrolled_rys_j.cu b/gpu4pyscf/lib/gvhf-rys/unrolled_rys_j.cu
@@ -2,6 +2,9 @@
 #include "vhf.cuh"
 #include "rys_roots_unrolled.cu"
 #include "create_tasks.cu"
+int rys_j_unrolled_lmax = 4;
+int rys_j_unrolled_max_order = 6;
+int rys_j_unrolled_max_gout_size = 90;
 
 
 __device__ static

diff --git a/gpu4pyscf/lib/gvhf/CMakeLists.txt b/gpu4pyscf/lib/gvhf/CMakeLists.txt
@@ -19,13 +19,13 @@
 
 add_library(gvhf SHARED
   constant.cu
-  nr_jk_driver.cu
+  #nr_jk_driver.cu
   nr_jk_driver_int3c2e_ip1.cu
   nr_jk_driver_int3c2e_ip2.cu
   nr_jk_driver_int3c2e_pass1.cu
   nr_jk_driver_int3c2e_pass2.cu
-  nr_jk_driver_ip1.cu
-        get_veff_driver_ip1.cu
+  #nr_jk_driver_ip1.cu
+  #get_veff_driver_ip1.cu
 )
 
 #option(BUILD_SHARED_LIBS "build shared libraries" 1)

diff --git a/gpu4pyscf/scf/jk.py b/gpu4pyscf/scf/jk.py
@@ -16,13 +16,20 @@
     'get_jk', 'get_j',
 ]
 
+libvhf_rys = load_library('libgvhf_rys')
+libvhf_rys.RYS_build_jk.restype = ctypes.c_int
+libvhf_rys.cuda_version.restype = ctypes.c_int
+CUDA_VERSION = libvhf_rys.cuda_version()
+
 PTR_BAS_COORD = 7
 LMAX = 4
 TILE = 2
 QUEUE_DEPTH = 262144
-UNROLL_ORDER = 4
-UNROLL_LMAX = 3
-UNROLL_NFMAX = 60
+UNROLL_ORDER = ctypes.c_int.in_dll(libvhf_rys, 'rys_jk_unrolled_max_order').value
+UNROLL_LMAX = ctypes.c_int.in_dll(libvhf_rys, 'rys_jk_unrolled_lmax').value
+UNROLL_NFMAX = ctypes.c_int.in_dll(libvhf_rys, 'rys_jk_unrolled_max_nf').value
+UNROLL_J_LMAX = ctypes.c_int.in_dll(libvhf_rys, 'rys_j_unrolled_lmax').value
+UNROLL_J_MAX_ORDER = ctypes.c_int.in_dll(libvhf_rys, 'rys_j_unrolled_max_order').value
 GOUT_WIDTH = 42
 SHM_SIZE = getattr(__config__, 'GPU_SHM_SIZE',
                    int(gpu_specs['sharedMemPerBlockOptin']//9)*8)
@@ -31,11 +38,6 @@
 # TODO: test different size for L2 cache efficiency
 NAO_IN_GROUP = 1500
 
-libvhf_rys = load_library('libgvhf_rys')
-libvhf_rys.RYS_build_jk.restype = ctypes.c_int
-libvhf_rys.cuda_version.restype = ctypes.c_int
-CUDA_VERSION = libvhf_rys.cuda_version()
-
 def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, verbose=None):
     '''Compute J, K matrices
     '''
@@ -705,9 +707,8 @@ def _j_engine_quartets_scheme(mol, l_ctr_pattern, shm_size=SHM_SIZE):
     nf3_ij = (lij+1)*(lij+2)*(lij+3)//6
     nf3_kl = (lkl+1)*(lkl+2)*(lkl+3)//6
     nroots = order // 2 + 1
-    lmax = 4  # not angular momentum of orbital basis. see rys_contract_j kernel
-    max_order = 6
-    if order <= max_order and lij <= lmax and lkl <= lmax:
+    # UNROLL_J_LMAX is distinct from UNROLL_LMAX of the orbital basis. See the rys_contract_j kernel.
+    if order <= UNROLL_J_MAX_ORDER and lij <= UNROLL_J_LMAX and lkl <= UNROLL_J_LMAX:
         if CUDA_VERSION >= 12040 and order <= 2:
             return 512, 1, False
         return 256, 1, False