Merge pull request #19 from coltonbh/feature-expand-crest

coltonbh · Oct 2, 2024 · 54b6755 · 54b6755
2 parents e123ca9 + 07bad33
commit 54b6755
Show file tree

Hide file tree

Showing 11 changed files with 794 additions and 8 deletions.
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -20,6 +20,8 @@
         "qcparse",
         "rotamer",
         "rotamers",
+        "runtypes",
+        "singlepoint",
         "spinmult",
         "tcin",
         "tcout",

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 
 ## [unreleased]
 
+### Added
+
+- Encoders and parsers for CREST to support `energy`, `gradient`, `hessian`, and `optimization` calculations.
+
 ## [0.6.3] - 2024-09-12
 
 ### Added

diff --git a/qcparse/encoders/crest.py b/qcparse/encoders/crest.py
@@ -8,7 +8,13 @@
 from qcparse.exceptions import EncoderError
 from qcparse.models import NativeInput
 
-SUPPORTED_CALCTYPES = {CalcType.conformer_search}
+SUPPORTED_CALCTYPES = {
+    CalcType.conformer_search,
+    CalcType.optimization,
+    CalcType.energy,
+    CalcType.gradient,
+    CalcType.hessian,
+}
 
 
 def encode(inp_obj: ProgramInput) -> NativeInput:
@@ -42,13 +48,47 @@ def validate_input(inp_obj: ProgramInput):
     """
     # These values come from other parts of the ProgramInput and should not be set
     # in the keywords.
-    non_allowed_keywords = ["charge", "uhf", "runtype"]
+    non_allowed_keywords = ["charge", "uhf"]
     for keyword in non_allowed_keywords:
         if keyword in inp_obj.keywords:
             raise EncoderError(
                 f"{keyword} should not be set in keywords for CREST. It is already set "
                 "on the Structure or ProgramInput elsewhere.",
             )
+    if "runtype" in inp_obj.keywords:
+        _validate_runtype_calctype(inp_obj.keywords["runtype"], inp_obj.calctype)
+
+
+def _validate_runtype_calctype(runtype: str, calctype: CalcType):
+    """Check that a user-supplied CREST runtype fits the requested calctype.
+
+    Args:
+        runtype: The value of the "runtype" keyword.
+        calctype: The calctype of the ProgramInput being encoded.
+
+    Raises:
+        EncoderError: If the calctype is one of those handled below and the
+            runtype is not in its set of accepted values. Any other calctype
+            is accepted without a check.
+    """
+    # Pick the runtypes accepted for this calctype
+    if calctype == CalcType.conformer_search:
+        valid_runtypes = {"imtd-gc", "imtd-smtd", "entropy", "nci", "nci-mtd"}
+    elif calctype == CalcType.optimization:
+        valid_runtypes = {"optimize", "ancopt"}
+    elif calctype in {CalcType.energy, CalcType.gradient}:
+        valid_runtypes = {"singlepoint"}
+    elif calctype == CalcType.hessian:
+        valid_runtypes = {"numhess"}
+    else:
+        # No runtype restrictions are defined for other calctypes
+        return
+
+    if runtype not in valid_runtypes:
+        raise EncoderError(
+            f"Unsupported runtype {runtype} for calctype {calctype}. Valid runtypes "
+            f"are: {valid_runtypes}.",
+        )
 
 
 def _to_toml_dict(inp_obj: ProgramInput, struct_filename: str) -> Dict[str, Any]:
@@ -64,8 +104,20 @@ def _to_toml_dict(inp_obj: ProgramInput, struct_filename: str) -> Dict[str, Any]
     toml_dict.setdefault("threads", os.cpu_count())
     toml_dict["input"] = struct_filename
 
-    # TODO: May need to deal with non-covalent mode at some point
-    toml_dict["runtype"] = "imtd-gc"
+    # Set default runtype if not already set
+    if "runtype" not in inp_obj.keywords:
+        if inp_obj.calctype == CalcType.conformer_search:
+            toml_dict["runtype"] = "imtd-gc"
+        elif inp_obj.calctype == CalcType.optimization:
+            toml_dict["runtype"] = "optimize"
+        elif inp_obj.calctype in {CalcType.energy, CalcType.gradient}:
+            toml_dict["runtype"] = "singlepoint"
+        elif inp_obj.calctype == CalcType.hessian:
+            toml_dict["runtype"] = "numhess"
+        else:
+            raise EncoderError(
+                f"Unsupported calctype {inp_obj.calctype} for CREST encoder.",
+            )
 
     # Calculation level keywords
     calculation = toml_dict.pop("calculation", {})

diff --git a/qcparse/parsers/crest.py b/qcparse/parsers/crest.py
@@ -1,8 +1,18 @@
+import re
 from pathlib import Path
-from typing import List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 import numpy as np
-from qcio import ConformerSearchResults, Structure
+from qcio import (
+    CalcType,
+    ConformerSearchResults,
+    OptimizationResults,
+    ProgramInput,
+    ProgramOutput,
+    Provenance,
+    SinglePointResults,
+    Structure,
+)
 
 from .utils import regex_search
 
@@ -87,3 +97,142 @@ def parse_conformer_search_dir(
         rotamers=rotamers,
         rotamer_energies=np.array(rotamer_energies),
     )
+
+
+def parse_energy_grad(text: str) -> SinglePointResults:
+    """Parse the output of a CREST energy and gradient calculation.
+
+    Args:
+        text: The text of the output file.
+
+    Returns:
+        The parsed energy and gradient as a SinglePointResults object.
+    """
+    energy_regex = r"# Energy \( Eh \)\n#*\n\s*([-\d.]+)"
+    # The last gradient value may end at end-of-text rather than at a newline;
+    # without the \Z alternative that final component would be silently dropped.
+    gradient_regex = r"# Gradient \( Eh/a0 \)\n#\s*\n((?:\s*[-\d.]+(?:\n|\Z))+)"
+
+    energy = float(regex_search(energy_regex, text).group(1))
+    gradient_text = regex_search(gradient_regex, text).group(1)
+    gradient = np.array([float(value) for value in gradient_text.split()])
+    return SinglePointResults(
+        energy=energy,
+        gradient=gradient,
+    )
+
+
+def parse_singlepoint_dir(
+    directory: Union[Path, str], filename: str = "crest.engrad"
+) -> SinglePointResults:
+    """Read a CREST single point results file and parse its energy and gradient.
+
+    Args:
+        directory: The directory in which CREST wrote its output files.
+        filename: Name of the results file inside the directory. Defaults to
+            'crest.engrad'.
+
+    Returns:
+        A SinglePointResults object built by parse_energy_grad from the file.
+
+    Raises:
+        FileNotFoundError: If the results file does not exist.
+    """
+    return parse_energy_grad(Path(directory, filename).read_text())
+
+
+def parse_numhess_dir(
+    directory: Union[Path, str],
+    filename: str = "numhess1",
+    stdout: Optional[str] = None,
+) -> SinglePointResults:
+    """Parse the output directory of a CREST numerical Hessian calculation.
+
+    Args:
+        directory: Path to the directory containing the CREST output files.
+        filename: The name of the file containing the numerical Hessian results.
+            Default is 'numhess1'.
+        stdout: Optional stdout from CREST. If given, the energy is parsed from it.
+
+    Returns:
+        The parsed numerical Hessian results as a SinglePointResults object.
+    """
+    data = (Path(directory) / filename).read_text()
+    # Keep signs and exponents attached so "1.5E-05" is not split into "1.5", "05".
+    float_regex = r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"
+    hessian = np.array(re.findall(float_regex, data), dtype=float)
+    spr_dict: Dict[str, Any] = {"hessian": hessian}
+    if stdout:
+        energy_regex = r"Energy\s*=\s*([-+]?\d+\.\d+)\s*Eh"
+        spr_dict["energy"] = float(regex_search(energy_regex, stdout).group(1))
+    return SinglePointResults(**spr_dict)
+
+
+def parse_optimization_dir(
+    directory: Union[Path, str],
+    *,
+    inp_obj: ProgramInput,
+    stdout: str,
+) -> OptimizationResults:
+    """Parse the output directory of a CREST optimization calculation.
+
+    Args:
+        directory: Path to the directory containing the CREST output files.
+        inp_obj: The qcio ProgramInput object for the optimization.
+        stdout: The stdout from CREST.
+
+    Returns:
+        The parsed optimization results as an OptimizationResults object.
+    """
+    # Read in the xyz file containing the trajectory
+    directory = Path(directory)
+    xyz_text = (directory / "crestopt.log").read_text()
+
+    # Parse structures and energies from the xyz file
+    structures = Structure.from_xyz_multi(
+        xyz_text,
+        charge=inp_obj.structure.charge,
+        multiplicity=inp_obj.structure.multiplicity,
+    )
+    energies = [
+        float(struct.extras[Structure._xyz_comment_key][1]) for struct in structures
+    ]
+
+    # Fake gradient for each step because CREST does not output it
+    fake_gradient = np.zeros(len(inp_obj.structure.symbols) * 3)
+
+    # Parse program version
+    program_version = parse_version_string(stdout)
+
+    # Collect final gradient if calculation succeeded
+    try:
+        final_spr = parse_singlepoint_dir(directory)
+    except FileNotFoundError:
+        # Calculation failed, so we don't have the final energy or gradient
+        final_spr = SinglePointResults(gradient=fake_gradient)
+
+    # Create the optimization trajectory. Each step gets its own copy of the
+    # placeholder so the in-place fill below cannot leak into other steps.
+    trajectory: List[ProgramOutput] = [
+        ProgramOutput(
+            input_data=ProgramInput(
+                calctype=CalcType.gradient,
+                structure=struct,
+                model=inp_obj.model,
+            ),
+            success=True,
+            results=SinglePointResults(energy=energy, gradient=fake_gradient.copy()),
+            provenance=Provenance(
+                program="crest",
+                program_version=program_version,
+            ),
+        )
+        for struct, energy in zip(structures, energies)
+    ]
+
+    # Fill in final gradient by writing into the last step's array in place
+    # https://github.com/crest-lab/crest/issues/354
+    trajectory[-1].results.gradient[:] = final_spr.gradient
+
+    return OptimizationResults(trajectory=trajectory)
diff --git a/tests/data/crest_output/crest.engrad b/tests/data/crest_output/crest.engrad
@@ -0,0 +1,20 @@
+#
+# Atoms
+#
+     3
+#
+# Energy ( Eh )
+#
+       -0.335557824179335
+#
+# Gradient ( Eh/a0 )
+#
+       -0.005962071557911
+       -0.004419818102026
+        0.003139227894649
+        0.003048425211480
+        0.001982394235964
+       -0.001779667371498
+        0.002913646346432
+        0.002437423866062
+       -0.001359560523152
diff --git a/tests/data/crest_output/crestopt.log b/tests/data/crest_output/crestopt.log
@@ -0,0 +1,65 @@
+  3
+ Etot=          -4.7918798035
+ O         -0.0934852751       -0.0692099762        0.0488995019
+ H          0.4307247549        1.6181264838        0.5296458319
+ H          1.0532005349       -0.5195317162       -1.3058451581
+  3
+ Etot=          -4.9229264187
+ O          0.1136777730        0.0841383057       -0.0594737049
+ H          0.3707615594        1.3916895508        0.4552315157
+ H          0.9060006825       -0.4464430651       -1.1230576351
+  3
+ Etot=          -5.0483521241
+ O          0.2573182669        0.1904912574       -0.1346012710
+ H          0.3599743190        1.1289027403        0.3128185924
+ H          0.7731474289       -0.2900092063       -0.9055171457
+  3
+ Etot=          -5.0170597610
+ O          0.1917723991        0.1420049385       -0.1002932645
+ H          0.4678576011        0.8951550442        0.0741564841
+ H          0.7308100147       -0.0077751914       -0.7011630439
+  3
+ Etot=          -5.0491088307
+ O          0.0623819936        0.0460852580       -0.0326872105
+ H          0.5219250724        0.9797836792        0.0717912538
+ H          0.8061329488        0.0035158541       -0.7664038677
+  3
+ Etot=          -5.0660326493
+ O          0.1987775715        0.1472783377       -0.1039067694
+ H          0.4093039819        1.0814532336        0.2382238721
+ H          0.7823584615       -0.1993467800       -0.8616169270
+  3
+ Etot=          -5.0730217268
+ O          0.1770402418        0.1302188641       -0.0930982848
+ H          0.4376238607        1.0313507655        0.1821152037
+ H          0.7757759123       -0.1321848383       -0.8163167432
+  3
+ Etot=          -5.0723078880
+ O          0.1736292273        0.1445536976       -0.0815172819
+ H          0.4380419413        1.0059759479        0.1669722835
+ H          0.7787688463       -0.1211448541       -0.8127548260
+  3
+ Etot=          -5.0683269168
+ O          0.1851362467        0.1179180760       -0.1079633483
+ H          0.4523839201        0.9974657046        0.1483557889
+ H          0.7529198480       -0.0859989893       -0.7676922650
+  3
+ Etot=          -5.0732163707
+ O          0.1760746513        0.1281388107       -0.0933864784
+ H          0.4401853431        1.0270846083        0.1771945599
+ H          0.7741800205       -0.1258386276       -0.8111079058
+  3
+ Etot=          -5.0733841586
+ O          0.1769285244        0.1328773064       -0.0914470659
+ H          0.4405581451        1.0164576782        0.1706642632
+ H          0.7729533454       -0.1199501932       -0.8065170216
+  3
+ Etot=          -5.0734021646
+ O          0.1782533830        0.1321019587       -0.0931605147
+ H          0.4400623060        1.0186302961        0.1723993514
+ H          0.7721243259       -0.1213474635       -0.8065386611
+  3
+ Etot=          -5.0734025156
+ O          0.1788766949        0.1323436930       -0.0936142242
+ H          0.4397631667        1.0187613429        0.1727606527
+ H          0.7718001532       -0.1217202445       -0.8064462529
diff --git a/tests/data/crest_output/numhess1 b/tests/data/crest_output/numhess1
@@ -0,0 +1,20 @@
+ $hessian
+      0.02040569     -0.00018059      0.02080099     -0.02081319      0.01511689
+      0.00867078      0.00037976     -0.01495837     -0.02946283
+     -0.00018059     -0.01341723     -0.03209513      0.01368595      0.03374600
+      0.01874084     -0.01351862     -0.02035995      0.01336374
+      0.02080099     -0.03209513      0.00327178      0.00784908      0.01737681
+     -0.01812512     -0.02863169      0.01472059      0.01485103
+     -0.02081319      0.01368595      0.00784908      0.01933555     -0.01625843
+     -0.00694960      0.00149263      0.00258608     -0.00090575
+      0.01511689      0.03374600      0.01737681     -0.01625843     -0.03409225
+     -0.01710500      0.00114214      0.00035657     -0.00027546
+      0.00867078      0.01874084     -0.01812512     -0.00694960     -0.01710500
+      0.01843539     -0.00173455     -0.00164242     -0.00030677
+      0.00037976     -0.01351862     -0.02863169      0.00149263      0.00114214
+     -0.00173455     -0.00185964      0.01238496      0.03036359
+     -0.01495837     -0.02035995      0.01472059      0.00258608      0.00035657
+     -0.00164242      0.01238496      0.02002423     -0.01308397
+     -0.02946283      0.01336374      0.01485103     -0.00090575     -0.00027546
+     -0.00030677      0.03036359     -0.01308397     -0.01454546
+ $end