Merge pull request #18 from coltonbh/feature-terachem-optimizations

coltonbh · Sep 13, 2024 · 314394b · 314394b
2 parents ff80a44 + b26e0a4
commit 314394b
Show file tree

Hide file tree

Showing 12 changed files with 688 additions and 19 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -34,6 +34,10 @@ repos:
       - id: mypy
         additional_dependencies:
           [tokenize-rt==3.2.0, pydantic>=1.0.0, types-paramiko, types-toml, qcio>=0.11.8]
+  - repo: https://github.com/crate-ci/typos
+    rev: v1.24.5
+    hooks:
+      - id: typos
   - repo: local
     hooks:
       - id: tests

diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -10,7 +10,9 @@
         "maxiter",
         "natom",
         "natoms",
+        "ndarray",
         "nocuda",
+        "optim",
         "pathconf",
         "psutil",
         "qcel",

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 
 ## [unreleased]
 
+### Added
+
+- `parse_optimization_dir(...) -> OptimizationResults` for TeraChem.
+
 ## [0.6.2] - 2024-08-13
 
 ### Added

diff --git a/pyproject.toml b/pyproject.toml
@@ -57,3 +57,9 @@ init_forbid_extra = true
 init_typed = true
 warn_required_dynamic_aliases = true
 warn_untyped_fields = true
+
+[tool.typos]
+# Exclude specific files or directories
+files.extend-exclude = [
+    "tests/data/**", # Everything under tests/data
+]
diff --git a/qcparse/encoders/terachem.py b/qcparse/encoders/terachem.py
@@ -3,7 +3,13 @@
 from qcparse.exceptions import EncoderError
 from qcparse.models import NativeInput
 
-SUPPORTED_CALCTYPES = {CalcType.energy, CalcType.gradient, CalcType.hessian}
+SUPPORTED_CALCTYPES = {
+    CalcType.energy,
+    CalcType.gradient,
+    CalcType.hessian,
+    CalcType.optimization,
+    CalcType.transition_state,
+}
 XYZ_FILENAME = "geometry.xyz"
 PADDING = 20  # padding between keyword and value in tc.in
 
@@ -21,6 +27,15 @@ def encode(inp_obj: ProgramInput) -> NativeInput:
     # calctype
     if inp_obj.calctype.value == CalcType.hessian:
         calctype = "frequencies"
+    elif inp_obj.calctype.value == CalcType.optimization:
+        calctype = "minimize"
+        if not inp_obj.keywords.get("new_minimizer", "no") == "yes":
+            raise EncoderError(
+                "Only the new_minimizer is supported for optimizations. Add "
+                "'new_minimizer': 'yes' to the keywords."
+            )
+    elif inp_obj.calctype.value == CalcType.transition_state:
+        calctype = "ts"
     else:
         calctype = inp_obj.calctype.value
 

diff --git a/qcparse/main.py b/qcparse/main.py
@@ -4,9 +4,9 @@
 import warnings
 from importlib import import_module
 from pathlib import Path
-from typing import List, Union
+from typing import List, Optional, Union
 
-from qcio import ProgramInput, SinglePointResults
+from qcio import CalcType, ProgramInput, SinglePointResults
 
 from .exceptions import EncoderError, MatchNotFoundError, ParserError
 from .models import NativeInput, ParserSpec, registry, single_point_results_namespace
@@ -20,6 +20,7 @@ def parse(
     data_or_path: Union[str, bytes, Path],
     program: str,
     filetype: str = "stdout",
+    calctype: Optional[CalcType] = None,
 ) -> SinglePointResults:
     """Parse a file using the parsers registered for the given program.
 
@@ -53,7 +54,7 @@ def parse(
 
     # Get the calctype if filetype is 'stdout'
     if filetype == "stdout":
-        calctype = parsers.parse_calctype(file_content)
+        calctype = calctype if calctype else parsers.parse_calctype(file_content)
 
     # Get all the parsers for the program, filetype, and calctype
     parser_specs: List[ParserSpec] = registry.get_parsers(program, filetype, calctype)

diff --git a/qcparse/parsers/terachem.py b/qcparse/parsers/terachem.py
@@ -1,8 +1,18 @@
 """Parsers for TeraChem output files."""
 
 import re
-
-from qcio import CalcType
+from pathlib import Path
+from typing import List, Optional, Union
+
+from qcio import (
+    CalcType,
+    OptimizationResults,
+    ProgramInput,
+    ProgramOutput,
+    Provenance,
+    SinglePointResults,
+    Structure,
+)
 
 from qcparse.exceptions import MatchNotFoundError
 from qcparse.models import FileType, ParsedDataCollector
@@ -38,21 +48,51 @@ def parse_energy(string: str, data_collector: ParsedDataCollector):
     data_collector.energy = float(regex_search(regex, string).group(1))
 
 
-@parser(only=[CalcType.gradient, CalcType.hessian])
-def parse_gradient(string: str, data_collector: ParsedDataCollector):
-    """Parse gradient from TeraChem stdout."""
+def parse_gradients(string: str, all: bool = True) -> List[List[List[float]]]:
+    """Parse gradients from TeraChem stdout.
+
+    Args:
+        string: The contents of the TeraChem stdout file.
+        all: If True, return all gradients. If False, return only the first gradient.
+
+    Returns:
+        A list of gradients. Each gradient is a list of 3-element lists, where each
+        3-element list is a gradient for an atom.
+    """
     # This will match all floats after the dE/dX dE/dY dE/dZ header and stop at the
-    # terminating ---- line
-    regex = r"(?<=dE\/dX\s{12}dE\/dY\s{12}dE\/dZ\n)[\d\.\-\s]+(?=\n-{2,})"
-    gradient_string = regex_search(regex, string).group()
+    # terminating -- or -= line that follows gradients or optimizations.
+    regex = r"(?<=dE\/dX\s{12}dE\/dY\s{12}dE\/dZ\n)[\d\.\-\s]+(?=\n(?:--|-=))"
+
+    if all is True:
+        match: Optional[Union[List, re.Match]] = re.findall(regex, string)
+    else:
+        match = re.search(regex, string)
+
+    if not match:
+        raise MatchNotFoundError(regex, string)
+
+    grad_strings: List[str] = match if all is True else [match.group()]  # type: ignore
+
+    gradients = []
 
-    # split string and cast to floats
-    values = [float(val) for val in gradient_string.split()]
+    for grad_string in grad_strings:
+        # split string and cast to floats
+        values = [float(val) for val in grad_string.split()]
 
-    # arrange into N x 3 gradient
-    gradient = []
-    for i in range(0, len(values), 3):
-        gradient.append(values[i : i + 3])
+        # arrange into N x 3 gradient
+        gradient = []
+        for i in range(0, len(values), 3):
+            gradient.append(values[i : i + 3])
+
+        gradients.append(gradient)
+
+    return gradients
+
+
+@parser(only=[CalcType.gradient, CalcType.hessian])
+def parse_gradient(string: str, data_collector: ParsedDataCollector):
+    """Parse first gradient from TeraChem stdout."""
+    gradient = parse_gradients(string, all=False)[0]
 
     data_collector.gradient = gradient
 
@@ -137,3 +177,65 @@ def calculation_succeeded(string: str) -> bool:
         # If any match for a failure regex is found, the calculation failed
         return True
     return False
+
+
+def parse_optimization_dir(
+    directory: Union[Path, str],
+    stdout: str,
+    *,
+    inp_obj: ProgramInput,
+) -> OptimizationResults:
+    """Parse the output directory of a TeraChem optimization calculation.
+
+    Args:
+        directory: Path to the directory containing the TeraChem output files.
+        stdout: The contents of the TeraChem stdout file.
+        inp_obj: The input object used for the calculation.
+
+    Returns:
+        OptimizationResults object
+    """
+    directory = Path(directory)
+
+    # Parse the structures
+    structures = Structure.open(directory / "optim.xyz")
+    assert isinstance(structures, list), "Expected multiple structures in optim.xyz"
+
+    # Parse Values
+    from qcparse import parse
+
+    # Parse all the values from the stdout file
+    spr = parse(stdout, "terachem", "stdout", CalcType.energy)
+
+    gradients = parse_gradients(stdout)
+    program_version = parse_version_string(stdout)
+
+    # Create the trajectory
+    trajectory: List[ProgramOutput] = [
+        ProgramOutput(
+            input_data=ProgramInput(
+                calctype=CalcType.gradient,
+                structure=structure,
+                model=inp_obj.model,
+                keywords=inp_obj.keywords,
+            ),
+            results=SinglePointResults(
+                **{
+                    **spr.model_dump(),
+                    # TeraChem writes the energy first on each frame's xyz comment line
+                    "energy": structure.extras[Structure._xyz_comment_key][0],
+                    # Will be coerced by Pydantic to np.ndarray
+                    "gradient": gradient,  # type: ignore
+                }
+            ),
+            success=True,
+            provenance=Provenance(
+                program="terachem",
+                program_version=program_version,
+                scratch_dir=directory.parent,
+            ),
+        )
+        for structure, gradient in zip(structures, gradients)
+    ]
+
+    return OptimizationResults(trajectory=trajectory)
diff --git a/tests/data/gradients.py b/tests/data/gradients.py
@@ -57,3 +57,26 @@
     [4.973e-06, -3.1333e-06, -7.764e-07],
     [6.7704e-06, 2.1271e-06, -3.2742e-06],
 ]
+
+water_opt = [
+    [
+        [0.0015991486, 0.0011623983, -0.0008220843],
+        [-0.0033943838, 0.0083169673, 0.0080418029],
+        [0.0017952356, -0.0094793661, -0.0072197183],
+    ],
+    [
+        [0.0007735883, 0.0005355768, -0.0003640441],
+        [0.0001656072, -0.0021072255, -0.0013624922],
+        [-0.0009391978, 0.0015716465, 0.0017265376],
+    ],
+    [
+        [0.0001367065, 7.05798e-05, -3.25781e-05],
+        [-5.57745e-05, -3.29821e-05, 4.5913e-05],
+        [-8.09322e-05, -3.7596e-05, -1.33348e-05],
+    ],
+    [
+        [3.50513e-05, -5.6371e-06, 1.96736e-05],
+        [-4.546e-07, -7.6209e-06, 8.1239e-06],
+        [-3.4594e-05, 1.32576e-05, -2.78001e-05],
+    ],
+]
diff --git a/tests/data/terachem_opt/optim.xyz b/tests/data/terachem_opt/optim.xyz
@@ -0,0 +1,20 @@
+3
+-7.6408224649875081e+01 frame 0   xyz file generated by TeraChem
+     O     0.0119228928    0.0093402642   -0.0064820105
+     H     0.2650769983    0.9093318768    0.2741564484
+     H     0.6072002355   -0.2640720907   -0.7301744291
+3
+-7.6408928705141307e+01 frame 1   xyz file generated by TeraChem
+     O    -0.0028251031   -0.0013840791    0.0011984340
+     H     0.2798216509    0.8896108556    0.2484225203
+     H     0.6072035665   -0.2336267117   -0.7121209539
+3
+-7.6408947912944399e+01 frame 2   xyz file generated by TeraChem
+     O    -0.0014561937   -0.0002263064    0.0002998534
+     H     0.2776781764    0.8928649961    0.2515321133
+     H     0.6079781950   -0.2380385656   -0.7143320023
+3
+-7.6408948109186298e+01 frame 3   xyz file generated by TeraChem
+     O    -0.0017239244   -0.0002612159    0.0002482950
+     H     0.2774292998    0.8930138681    0.2510061731
+     H     0.6084948100   -0.2381525793   -0.7137545065