Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature terachem optimizations #18

Merged
merged 2 commits into from
Sep 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ repos:
- id: mypy
additional_dependencies:
[tokenize-rt==3.2.0, pydantic>=1.0.0, types-paramiko, types-toml, qcio>=0.11.8]
- repo: https://github.com/crate-ci/typos
rev: v1.24.5
hooks:
- id: typos
- repo: local
hooks:
- id: tests
Expand Down
2 changes: 2 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
"maxiter",
"natom",
"natoms",
"ndarray",
"nocuda",
"optim",
"pathconf",
"psutil",
"qcel",
Expand Down
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),

## [unreleased]

### Added

- `parse_optimization_dir(...) -> OptimizationResults` for TeraChem.

## [0.6.2] - 2024-08-13

### Added
Expand Down
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,9 @@ init_forbid_extra = true
init_typed = true
warn_required_dynamic_aliases = true
warn_untyped_fields = true

[tool.typos]
# Exclude specific files or directories
files.extend-exclude = [
"tests/data/**", # Everything under tests/data (recursive glob)
]
17 changes: 16 additions & 1 deletion qcparse/encoders/terachem.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,13 @@
from qcparse.exceptions import EncoderError
from qcparse.models import NativeInput

SUPPORTED_CALCTYPES = {CalcType.energy, CalcType.gradient, CalcType.hessian}
SUPPORTED_CALCTYPES = {
CalcType.energy,
CalcType.gradient,
CalcType.hessian,
CalcType.optimization,
CalcType.transition_state,
}
XYZ_FILENAME = "geometry.xyz"
PADDING = 20 # padding between keyword and value in tc.in

Expand All @@ -21,6 +27,15 @@ def encode(inp_obj: ProgramInput) -> NativeInput:
# calctype
if inp_obj.calctype.value == CalcType.hessian:
calctype = "frequencies"
elif inp_obj.calctype.value == CalcType.optimization:
calctype = "minimize"
if not inp_obj.keywords.get("new_minimizer", "no") == "yes":
raise EncoderError(
"Only the new_minimizer is supported for optimizations. Add "
"'new_minimizer': 'yes' to the keywords."
)
elif inp_obj.calctype.value == CalcType.transition_state:
calctype = "ts"
else:
calctype = inp_obj.calctype.value

Expand Down
7 changes: 4 additions & 3 deletions qcparse/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
import warnings
from importlib import import_module
from pathlib import Path
from typing import List, Union
from typing import List, Optional, Union

from qcio import ProgramInput, SinglePointResults
from qcio import CalcType, ProgramInput, SinglePointResults

from .exceptions import EncoderError, MatchNotFoundError, ParserError
from .models import NativeInput, ParserSpec, registry, single_point_results_namespace
Expand All @@ -20,6 +20,7 @@ def parse(
data_or_path: Union[str, bytes, Path],
program: str,
filetype: str = "stdout",
calctype: Optional[CalcType] = None,
) -> SinglePointResults:
"""Parse a file using the parsers registered for the given program.
Expand Down Expand Up @@ -53,7 +54,7 @@ def parse(

# Get the calctype if filetype is 'stdout'
if filetype == "stdout":
calctype = parsers.parse_calctype(file_content)
calctype = calctype if calctype else parsers.parse_calctype(file_content)

# Get all the parsers for the program, filetype, and calctype
parser_specs: List[ParserSpec] = registry.get_parsers(program, filetype, calctype)
Expand Down
130 changes: 116 additions & 14 deletions qcparse/parsers/terachem.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,18 @@
"""Parsers for TeraChem output files."""

import re

from qcio import CalcType
from pathlib import Path
from typing import List, Optional, Union

from qcio import (
CalcType,
OptimizationResults,
ProgramInput,
ProgramOutput,
Provenance,
SinglePointResults,
Structure,
)

from qcparse.exceptions import MatchNotFoundError
from qcparse.models import FileType, ParsedDataCollector
Expand Down Expand Up @@ -38,21 +48,51 @@ def parse_energy(string: str, data_collector: ParsedDataCollector):
data_collector.energy = float(regex_search(regex, string).group(1))


@parser(only=[CalcType.gradient, CalcType.hessian])
def parse_gradient(string: str, data_collector: ParsedDataCollector):
"""Parse gradient from TeraChem stdout."""
def parse_gradients(string: str, all: bool = True) -> List[List[List[float]]]:
    """Parse gradients from TeraChem stdout.

    Args:
        string: The contents of the TeraChem stdout file.
        all: If True, return all gradients. If False, return only the first
            gradient (still wrapped in a one-element list).

    Returns:
        A list of gradients. Each gradient is a list of 3-element lists, where each
        3-element list is a gradient for an atom.

    Raises:
        MatchNotFoundError: If no gradient block is found in ``string``.
    """
    # This will match all floats after the dE/dX dE/dY dE/dZ header and stop at the
    # terminating -- or -= line that follows gradients or optimizations.
    regex = r"(?<=dE\/dX\s{12}dE\/dY\s{12}dE\/dZ\n)[\d\.\-\s]+(?=\n(?:--|-=))"

    # Normalise both modes to a plain list of matched strings so the rest of the
    # function has a single code path.
    if all:
        grad_strings: List[str] = re.findall(regex, string)
    else:
        first_match = re.search(regex, string)
        grad_strings = [first_match.group()] if first_match else []

    if not grad_strings:
        raise MatchNotFoundError(regex, string)

    gradients: List[List[List[float]]] = []
    for grad_string in grad_strings:
        # split string and cast to floats
        values = [float(val) for val in grad_string.split()]
        # arrange into N x 3 gradient
        gradients.append([values[i : i + 3] for i in range(0, len(values), 3)])

    return gradients


@parser(only=[CalcType.gradient, CalcType.hessian])
def parse_gradient(string: str, data_collector: ParsedDataCollector):
    """Store the first gradient found in TeraChem stdout on the data collector."""
    # all=False asks parse_gradients for only the first gradient block; it still
    # comes back wrapped in a list, hence the [0].
    data_collector.gradient = parse_gradients(string, all=False)[0]

Expand Down Expand Up @@ -137,3 +177,65 @@ def calculation_succeeded(string: str) -> bool:
# If any match for a failure regex is found, the calculation failed
return True
return False


def parse_optimization_dir(
    directory: Union[Path, str],
    stdout: str,
    *,
    inp_obj: ProgramInput,
) -> OptimizationResults:
    """Build OptimizationResults from a TeraChem optimization output directory.

    Args:
        directory: Path to the directory containing the TeraChem output files;
            ``optim.xyz`` is read from it.
        stdout: The contents of the TeraChem stdout file.
        inp_obj: The input object used for the calculation. Its model and keywords
            are copied onto the input of every trajectory step.

    Returns:
        OptimizationResults whose trajectory holds one gradient ProgramOutput per
        (structure, gradient) pair.
    """
    directory = Path(directory)

    # One structure per frame is expected in optim.xyz.
    frames = Structure.open(directory / "optim.xyz")
    assert isinstance(frames, list), "Expected multiple structures in optim.xyz"

    # Imported inside the function rather than at module level
    # (presumably to avoid a circular import with the qcparse package -- confirm).
    from qcparse import parse

    # Values parsed once from stdout and reused for every step.
    shared_results = parse(stdout, "terachem", "stdout", CalcType.energy)
    step_gradients = parse_gradients(stdout)
    version = parse_version_string(stdout)

    trajectory: List[ProgramOutput] = []
    # zip pairs frames with gradients positionally and stops at the shorter one.
    for frame, step_gradient in zip(frames, step_gradients):
        step_input = ProgramInput(
            calctype=CalcType.gradient,
            structure=frame,
            model=inp_obj.model,
            keywords=inp_obj.keywords,
        )

        # Start from the shared values, then override the per-step ones.
        result_fields = shared_results.model_dump()
        # TeraChem places the energy as the first comment in the xyz file
        result_fields["energy"] = frame.extras[Structure._xyz_comment_key][0]
        # Will be coerced by Pydantic to np.ndarray
        result_fields["gradient"] = step_gradient
        step_results = SinglePointResults(**result_fields)

        step_provenance = Provenance(
            program="terachem",
            program_version=version,
            scratch_dir=directory.parent,
        )

        trajectory.append(
            ProgramOutput(
                input_data=step_input,
                results=step_results,
                success=True,
                provenance=step_provenance,
            )
        )

    return OptimizationResults(trajectory=trajectory)
23 changes: 23 additions & 0 deletions tests/data/gradients.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,26 @@
[4.973e-06, -3.1333e-06, -7.764e-07],
[6.7704e-06, 2.1271e-06, -3.2742e-06],
]

# Expected gradients for the water optimization test case: four gradients, each
# holding three rows of three floats.
# NOTE(review): presumably one gradient per frame of
# tests/data/terachem_opt/optim.xyz, one row per atom -- confirm against the
# test that consumes it.
water_opt = [
[
[0.0015991486, 0.0011623983, -0.0008220843],
[-0.0033943838, 0.0083169673, 0.0080418029],
[0.0017952356, -0.0094793661, -0.0072197183],
],
[
[0.0007735883, 0.0005355768, -0.0003640441],
[0.0001656072, -0.0021072255, -0.0013624922],
[-0.0009391978, 0.0015716465, 0.0017265376],
],
[
[0.0001367065, 7.05798e-05, -3.25781e-05],
[-5.57745e-05, -3.29821e-05, 4.5913e-05],
[-8.09322e-05, -3.7596e-05, -1.33348e-05],
],
[
[3.50513e-05, -5.6371e-06, 1.96736e-05],
[-4.546e-07, -7.6209e-06, 8.1239e-06],
[-3.4594e-05, 1.32576e-05, -2.78001e-05],
],
]
20 changes: 20 additions & 0 deletions tests/data/terachem_opt/optim.xyz
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
3
-7.6408224649875081e+01 frame 0 xyz file generated by TeraChem
O 0.0119228928 0.0093402642 -0.0064820105
H 0.2650769983 0.9093318768 0.2741564484
H 0.6072002355 -0.2640720907 -0.7301744291
3
-7.6408928705141307e+01 frame 1 xyz file generated by TeraChem
O -0.0028251031 -0.0013840791 0.0011984340
H 0.2798216509 0.8896108556 0.2484225203
H 0.6072035665 -0.2336267117 -0.7121209539
3
-7.6408947912944399e+01 frame 2 xyz file generated by TeraChem
O -0.0014561937 -0.0002263064 0.0002998534
H 0.2776781764 0.8928649961 0.2515321133
H 0.6079781950 -0.2380385656 -0.7143320023
3
-7.6408948109186298e+01 frame 3 xyz file generated by TeraChem
O -0.0017239244 -0.0002612159 0.0002482950
H 0.2774292998 0.8930138681 0.2510061731
H 0.6084948100 -0.2381525793 -0.7137545065
Loading
Loading