Skip to content

Commit

Permalink
Added encoders for TeraChem and the general encoder workflow. Added F…
Browse files Browse the repository at this point in the history
…ileType.stdout as default filetype for parser decorator to reduce verbosity of parser code.
  • Loading branch information
coltonbh committed Sep 28, 2023
1 parent 7035cde commit d049a77
Show file tree
Hide file tree
Showing 19 changed files with 292 additions and 72 deletions.
2 changes: 2 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@
"CUDA",
"Hartree",
"htmlcov",
"maxiter",
"natom",
"natoms",
"nocuda",
"pathconf",
"qcel",
"qcio",
"qcparse",
"spinmult",
"tcin",
"tcout",
"tcparse",
Expand Down
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),

- All input parsing details from the library.

### Added

- `encode` top-level function and encoder for TeraChem input files.

### Changed

- Added `FileType.stdout` as default `filetype` argument to `parse` decorator to reduce boilerplate in parsers.

## [0.5.1] - 2023-09-19

### Changed
Expand Down
6 changes: 3 additions & 3 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ Hey there 👋! Look at you wanting to contribute! This package is designed to m

## TL;DR - How to add new parsers

1. Create a file in the `parsers` directory named after the quantum chemistry program, e.g., `qchem.py`.
2. Create `class FileType(str, Enum)` in the module registering the file types the parsers support.
1. Create a file in the `parsers` directory named after the quantum chemistry program, e.g., `terachem.py`.
2. Create a `SUPPORTED_FILETYPES` set in the module containing the file types the parsers support.
3. If `stdout` is a file type then create a `def parse_calctype(string: str) -> CalcType` function that returns the `CalcType` for the file: one of `CalcType.energy`, `CalcType.gradient`, or `CalcType.hessian`.
4. Create simple parser functions that accept file data (`str|bytes`) and a `data_collector` object. The parser should 1) parse a single piece of data from the file, 2) cast it to the correct Python type and 3) set it on the output object at its corresponding location found on the `qcio.SinglePointResults` object. Register this parser by decorating it with the `@parser` decorator. The decorator must declare `filetype` and can optionally declare `required` (`True` by default), and `only` (`None` by default). See the `qcparse.utils.parser` decorator for details on what these mean.
4. Create simple parser functions that accept file data (`str | bytes`) and a `data_collector` object. The parser should 1) parse a single piece of data from the file, 2) cast it to the correct Python type and 3) set it on the output object at its corresponding location found on the `qcio.SinglePointResults` object. Register this parser by decorating it with the `@parser()` decorator. The decorator optionally accepts a `filetype` argument (`FileType.stdout` by default) and can declare keyword arguments `required` (`True` by default), and `only` (`None` by default). See the `qcparse.utils.parser` decorator for details on what these mean.

```py
@parser(filetype=FileType.stdout)
Expand Down
8 changes: 4 additions & 4 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ readme = "README.md"
[tool.poetry.dependencies]
python = "^3.8"
pydantic = ">=2.0.0"
qcio = ">=0.5.0"
qcio = ">=0.7.1"

[tool.poetry.group.dev.dependencies]
mypy = "^1.1.1"
Expand Down
4 changes: 2 additions & 2 deletions qcparse/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
__version__ = metadata.version(__name__)


from .main import parse, parse_results # noqa: F401
from .main import encode, parse, parse_results # noqa: F401
from .models import registry # noqa: F401

__all__ = ["parse", "parse_results", "registry"]
__all__ = ["parse", "parse_results", "encode", "registry"]
Empty file added qcparse/encoders/__init__.py
Empty file.
59 changes: 59 additions & 0 deletions qcparse/encoders/terachem.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from qcio import CalcType, ProgramInput

from qcparse.exceptions import EncoderError
from qcparse.models import NativeInput

# Calculation types this encoder can translate into a TeraChem input.
SUPPORTED_CALCTYPES = {CalcType.energy, CalcType.gradient, CalcType.hessian}
# Name given to the geometry file; written after the 'coordinates' keyword in tc.in.
XYZ_FILENAME = "geometry.xyz"
PADDING = 20 # minimum width of the left-justified keyword column in tc.in


def encode(inp_obj: ProgramInput) -> NativeInput:
    """Translate a ProgramInput into TeraChem input files.

    Args:
        inp_obj: The qcio ProgramInput object for a computation.

    Returns:
        NativeInput whose ``input_file`` holds the tc.in contents,
        ``geometry_file`` the output of ``inp_obj.molecule.to_xyz()`` and
        ``geometry_filename`` the file name referenced by the ``coordinates``
        line of the tc.in file.

    Raises:
        EncoderError: If ``inp_obj.keywords`` contains a key that must be set
            as structured data instead (charge, spinmult, run, basis, method).
    """
    # A hessian calctype is written as "frequencies" in tc.in; every other
    # calctype is written using its enum value unchanged.
    if inp_obj.calctype == CalcType.hessian:
        calctype = "frequencies"
    else:
        calctype = inp_obj.calctype.value

    # Keys that may not appear in .keywords, mapped to where they belong.
    non_keywords = {
        "charge": ".molecule.charge",
        "spinmult": ".molecule.multiplicity",
        "run": ".calctype",
        "basis": ".model.basis",
        "method": ".model.method",
    }

    # Collect lines for input file: run type, molecule, then model.
    inp_lines = [
        f"{'run':<{PADDING}} {calctype}",
        f"{'coordinates':<{PADDING}} {XYZ_FILENAME}",
        f"{'charge':<{PADDING}} {inp_obj.molecule.charge}",
        f"{'spinmult':<{PADDING}} {inp_obj.molecule.multiplicity}",
        f"{'method':<{PADDING}} {inp_obj.model.method}",
        f"{'basis':<{PADDING}} {inp_obj.model.basis}",
    ]

    for key, value in inp_obj.keywords.items():
        # Check for keywords that should be passed as structured data
        if key in non_keywords:
            raise EncoderError(
                f"Keyword '{key}' should not be set as a keyword. It "
                f"should be set at '{non_keywords[key]}'",
            )
        # Lowercase booleans only (True -> true). Lowercasing every value would
        # corrupt case-sensitive values such as file paths.
        rendered = str(value).lower() if isinstance(value, bool) else str(value)
        inp_lines.append(f"{key:<{PADDING}} {rendered}")

    return NativeInput(
        input_file="\n".join(inp_lines) + "\n",  # End file with newline
        geometry_file=inp_obj.molecule.to_xyz(),
        geometry_filename=XYZ_FILENAME,
    )
10 changes: 9 additions & 1 deletion qcparse/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@ class BaseError(Exception):
"""Base tcparse exceptions"""


class MatchNotFoundError(BaseError):
class ParserError(BaseError):
    """Common base class for errors raised while parsing."""


class MatchNotFoundError(ParserError):
"""Exception raised when a parsing match is not found"""

def __init__(self, regex: str, string: str):
Expand All @@ -15,3 +19,7 @@ def __init__(self, regex: str, string: str):

class RegistryError(BaseError):
    """Raised when a registry operation fails."""


class EncoderError(BaseError):
    """Exception raised when an encoder error occurs"""
59 changes: 40 additions & 19 deletions qcparse/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@
from pathlib import Path
from typing import List, Union

from qcio import SinglePointResults
from qcio import ProgramInput, SinglePointResults

from .exceptions import MatchNotFoundError
from .models import ParserSpec, registry, single_point_results_namespace
from .exceptions import EncoderError, MatchNotFoundError, ParserError
from .models import NativeInput, ParserSpec, registry, single_point_results_namespace
from .parsers import * # noqa: F403 Ensure all parsers get registered
from .utils import get_file_contents

__all__ = ["parse", "parse_results", "registry"]
__all__ = ["parse", "parse_results", "encode", "registry"]


def parse(
Expand All @@ -40,36 +40,36 @@ def parse(
A SinglePointResults object containing the parsed data.
Raises:
ParserError: If no parsers are registered for the filetype of the program.
MatchNotFoundError: If a required parser fails to parse its data.
"""
file_content, _ = get_file_contents(data_or_path)
parsers = import_module(f"qcparse.parsers.{program}")

# Create a SinglePointResult namespace object to collect the parsed data
spr_namespace = single_point_results_namespace()
# Check that filetype is supported by the program's parsers
if filetype not in parsers.SUPPORTED_FILETYPES:
raise ParserError(f"filetype '{filetype}' not supported by {program} parsers.")

file_content = get_file_contents(data_or_path)

# Get the calctype if filetype is 'stdout'
if filetype == "stdout":
parse_calctype = import_module(f"qcparse.parsers.{program}").parse_calctype
calctype = parse_calctype(file_content)
calctype = parsers.parse_calctype(file_content)

else:
calctype = None

# Get all the parsers for the program and filetype
# Get all the parsers for the program, filetype, and calctype
parser_specs: List[ParserSpec] = registry.get_parsers(program, filetype, calctype)

# Create a SinglePointResult namespace object to collect the parsed data
data_collector = single_point_results_namespace()

# Apply parsers to the file content.
for ps in parser_specs:
try:
# This will raise a MatchNotFound error if the parser can't find its data
ps.parser(file_content, spr_namespace)
except MatchNotFoundError:
ps.parser(file_content, data_collector)
except MatchNotFoundError: # Raised if the parser can't find its data
if ps.required:
raise
else: # Parser didn't find anything, but it wasn't required
pass

return SinglePointResults(**spr_namespace.dict())
return SinglePointResults(**data_collector.dict())


@functools.wraps(parse)
Expand All @@ -81,3 +81,24 @@ def parse_results(*args, **kwargs):
stacklevel=2,
)
return parse(*args, **kwargs)


def encode(inp_data: ProgramInput, program: str) -> NativeInput:
    """Build the native input files of ``program`` from a ProgramInput.

    The encoder is the module ``qcparse.encoders.<program>``; it is imported on
    demand and its ``encode`` function does the actual translation.

    Args:
        inp_data: The ProgramInput object to encode.
        program: The program for which to encode the input.

    Returns:
        The NativeInput object produced by the program's encoder.

    Raises:
        EncoderError: If the calctype of ``inp_data`` is not listed in the
            encoder module's ``SUPPORTED_CALCTYPES``.
    """
    encoder_module = import_module(f"qcparse.encoders.{program}")

    calctype_supported = inp_data.calctype in encoder_module.SUPPORTED_CALCTYPES
    if calctype_supported:
        return encoder_module.encode(inp_data)

    raise EncoderError(f"Calctype '{inp_data.calctype}' not supported by encoder.")
34 changes: 33 additions & 1 deletion qcparse/models.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
"""Simple data models to support parsing of QM program output files."""

from collections import defaultdict
from enum import Enum
from types import SimpleNamespace
from typing import Callable, Dict, List, Optional

from pydantic import BaseModel
from pydantic import BaseModel, model_validator
from qcio import CalcType

from .exceptions import RegistryError
Expand Down Expand Up @@ -150,3 +151,34 @@ def single_point_results_namespace() -> ParsedDataCollector:
output_obj.extras = ParsedDataCollector()

return output_obj


class FileType(str, Enum):
    """File types that parsers can be registered for.

    Members are also plain strings, so ``FileType.stdout == "stdout"``.
    """

    stdout = "stdout"


class NativeInput(BaseModel):
    """Native input file data. Writing these files to disk should produce a valid input.

    Attributes:
        input_file: input file for the program
        geometry_file: xyz file or other geometry file required for the calculation
        geometry_filename: filename of the geometry file referenced in the input
    """

    input_file: str
    geometry_file: Optional[str] = None
    geometry_filename: Optional[str] = None

    @model_validator(mode="after")
    def ensure_geometry_filename(self):
        """Ensure that geometry_filename is set if geometry_file is set."""
        # Geometry contents without a filename cannot be written to the
        # location the input file refers to, so reject that combination.
        if self.geometry_file and not self.geometry_filename:
            raise ValueError(
                "geometry_filename must be set if geometry_file is set. "
                "Set geometry_filename to the name of the geometry file as referenced "
                "in the input file."
            )
        return self
21 changes: 8 additions & 13 deletions qcparse/parsers/terachem.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,15 @@
"""Parsers for TeraChem output files."""

import re
from enum import Enum

from qcio import CalcType

from qcparse.exceptions import MatchNotFoundError
from qcparse.models import ParsedDataCollector
from qcparse.models import FileType, ParsedDataCollector

from .utils import parser, regex_search


class FileType(str, Enum):
"""Enum of supported TeraChem filetypes."""

stdout = "stdout"
SUPPORTED_FILETYPES = {FileType.stdout}


def parse_calctype(string: str) -> CalcType:
Expand All @@ -31,7 +26,7 @@ def parse_calctype(string: str) -> CalcType:
raise MatchNotFoundError(regex, string)


@parser(filetype=FileType.stdout)
@parser()
def parse_energy(string: str, data_collector: ParsedDataCollector):
"""Parse the final energy from TeraChem stdout.
Expand All @@ -43,7 +38,7 @@ def parse_energy(string: str, data_collector: ParsedDataCollector):
data_collector.energy = float(regex_search(regex, string).group(1))


@parser(filetype=FileType.stdout, only=[CalcType.gradient, CalcType.hessian])
@parser(only=[CalcType.gradient, CalcType.hessian])
def parse_gradient(string: str, data_collector: ParsedDataCollector):
"""Parse gradient from TeraChem stdout."""
# This will match all floats after the dE/dX dE/dY dE/dZ header and stop at the
Expand All @@ -62,7 +57,7 @@ def parse_gradient(string: str, data_collector: ParsedDataCollector):
data_collector.gradient = gradient


@parser(filetype=FileType.stdout, only=[CalcType.hessian])
@parser(only=[CalcType.hessian])
def parse_hessian(string: str, data_collector: ParsedDataCollector):
"""Parse Hessian Matrix from TeraChem stdout
Expand Down Expand Up @@ -102,14 +97,14 @@ def parse_hessian(string: str, data_collector: ParsedDataCollector):
data_collector.hessian = hessian


@parser(filetype=FileType.stdout)
@parser()
def parse_natoms(string: str, data_collector: ParsedDataCollector):
    """Store the integer following 'Total atoms:' in TeraChem stdout as calcinfo_natoms."""
    match = regex_search(r"Total atoms:\s*(\d+)", string)
    data_collector.calcinfo_natoms = int(match.group(1))


@parser(filetype=FileType.stdout)
@parser()
def parse_nmo(string: str, data_collector: ParsedDataCollector):
"""Parse the number of molecular orbitals TeraChem stdout"""
regex = r"Total orbitals:\s*(\d+)"
Expand All @@ -136,7 +131,7 @@ def parse_version_string(string: str) -> str:
return f"{parse_terachem_version(string)} [{parse_git_commit(string)}]"


@parser(filetype=FileType.stdout)
@parser()
def parse_version(string: str, data_collector: ParsedDataCollector):
"""Parse TeraChem version from TeraChem stdout."""
data_collector.extras.program_version = parse_version_string(string)
Expand Down
Loading

0 comments on commit d049a77

Please sign in to comment.