Added encoders for TeraChem and the general encoder workflow. Added FileType.stdout as default filetype for parser decorator to reduce verbosity of parser code.
coltonbh · Sep 28, 2023 · d049a77 · d049a77
1 parent 7035cde
commit d049a77
Show file tree

Hide file tree

Showing 19 changed files with 292 additions and 72 deletions.
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -7,13 +7,15 @@
         "CUDA",
         "Hartree",
         "htmlcov",
+        "maxiter",
         "natom",
         "natoms",
         "nocuda",
         "pathconf",
         "qcel",
         "qcio",
         "qcparse",
+        "spinmult",
         "tcin",
         "tcout",
         "tcparse",

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 
 - All input parsing details from the library.
 
+### Added
+
+- `encode` top level function and encoder for TeraChem input files.
+
+### Changed
+
+- Added `FileType.stdout` as default `filetype` argument to `parser` decorator to reduce boilerplate in parsers.
+
 ## [0.5.1] - 2023-09-19
 
 ### Changed

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -4,10 +4,10 @@ Hey there 👋! Look at you wanting to contribute! This package is designed to m
 
 ## TL;DR - How to add new parsers
 
-1. Create a file in the `parsers` directory named after the quantum chemistry program, e.g., `qchem.py`.
-2. Create `class FileType(str, Enum)` in the module registering the file types the parsers support.
+1. Create a file in the `parsers` directory named after the quantum chemistry program, e.g., `terachem.py`.
+2. Create a `SUPPORTED_FILETYPES` set in the module containing the file types the parsers support.
 3. If `stdout` is a file type then create a `def parse_calctype(string: str) -> CalcType` function that returns the `CalcType` for the file. One of `CalcType.energy`, `CalcType.gradient`, or `CalcType.hessian`.
-4. Create simple parser functions that accept file data (`str|bytes`) and a `data_collector` object. The parser should 1) parse a single piece of data from the file, 2) cast it to the correct Python type and 3) set it on the output object at its corresponding location found on the `qcio.SinglePointResults` object. Register this parser by decorating it with the `@parser` decorator. The decorator must declare `filetype` and can optionally declare `required` (`True` by default), and `only` (`None` by default). See the `qcparse.utils.parser` decorator for details on what these mean.
+4. Create simple parser functions that accept file data (`str | bytes`) and a `data_collector` object. The parser should 1) parse a single piece of data from the file, 2) cast it to the correct Python type and 3) set it on the output object at its corresponding location found on the `qcio.SinglePointResults` object. Register this parser by decorating it with the `@parser()` decorator. The decorator optionally accepts a `filetype` argument (`FileType.stdout` by default) and can declare keyword arguments `required` (`True` by default), and `only` (`None` by default). See the `qcparse.utils.parser` decorator for details on what these mean.
 
    ```py
    @parser(filetype=FileType.stdout)

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -9,7 +9,7 @@ readme = "README.md"
 [tool.poetry.dependencies]
 python = "^3.8"
 pydantic = ">=2.0.0"
-qcio = ">=0.5.0"
+qcio = ">=0.7.1"
 
 [tool.poetry.group.dev.dependencies]
 mypy = "^1.1.1"

diff --git a/qcparse/__init__.py b/qcparse/__init__.py
@@ -4,7 +4,7 @@
 __version__ = metadata.version(__name__)
 
 
-from .main import parse, parse_results  # noqa: F401
+from .main import encode, parse, parse_results  # noqa: F401
 from .models import registry  # noqa: F401
 
-__all__ = ["parse", "parse_results", "registry"]
+__all__ = ["parse", "parse_results", "encode", "registry"]
diff --git a/qcparse/encoders/__init__.py b/qcparse/encoders/__init__.py
diff --git a/qcparse/encoders/terachem.py b/qcparse/encoders/terachem.py
@@ -0,0 +1,59 @@
+from qcio import CalcType, ProgramInput
+
+from qcparse.exceptions import EncoderError
+from qcparse.models import NativeInput
+
+SUPPORTED_CALCTYPES = {CalcType.energy, CalcType.gradient, CalcType.hessian}
+XYZ_FILENAME = "geometry.xyz"
+PADDING = 20  # padding between keyword and value in tc.in
+
+
+def encode(inp_obj: ProgramInput) -> NativeInput:
+    """Translate a ProgramInput into TeraChem input files.
+
+    Args:
+        inp_obj: The qcio ProgramInput object for a computation.
+
+    Returns:
+        NativeInput with .input_file being a tc.in file and .geometry_file an xyz file.
+    """
+
+    # calctype
+    if inp_obj.calctype.value == CalcType.hessian:
+        calctype = "frequencies"
+    else:
+        calctype = inp_obj.calctype.value
+
+    # Collect lines for input file
+    inp_lines = []
+    inp_lines.append(f"{'run':<{PADDING}} {calctype}")
+    # Molecule
+    inp_lines.append(f"{'coordinates':<{PADDING}} {XYZ_FILENAME}")
+    inp_lines.append(f"{'charge':<{PADDING}} {inp_obj.molecule.charge}")
+    inp_lines.append(f"{'spinmult':<{PADDING}} {inp_obj.molecule.multiplicity}")
+    # Model
+    inp_lines.append(f"{'method':<{PADDING}} {inp_obj.model.method}")
+    inp_lines.append(f"{'basis':<{PADDING}} {inp_obj.model.basis}")
+
+    # Keywords
+    non_keywords = {
+        "charge": ".molecule.charge",
+        "spinmult": ".molecule.multiplicity",
+        "run": ".calctype",
+        "basis": ".model.basis",
+        "method": ".model.method",
+    }
+    for key, value in inp_obj.keywords.items():
+        # Check for keywords that should be passed as structured data
+        if key in non_keywords:
+            raise EncoderError(
+                f"Keyword '{key}' should not be set as a keyword. It "
+                f"should be set at '{non_keywords[key]}'",
+            )
+        # Stringify and lowercase every value (e.g. True -> "true")
+        inp_lines.append(f"{key:<{PADDING}} {str(value).lower()}")
+    return NativeInput(
+        input_file="\n".join(inp_lines) + "\n",  # End file with newline
+        geometry_file=inp_obj.molecule.to_xyz(),
+        geometry_filename=XYZ_FILENAME,
+    )
diff --git a/qcparse/exceptions.py b/qcparse/exceptions.py
@@ -2,7 +2,11 @@ class BaseError(Exception):
     """Base qcparse exceptions"""
 
 
-class MatchNotFoundError(BaseError):
+class ParserError(BaseError):
+    """Base exception for parsers"""
+
+
+class MatchNotFoundError(ParserError):
     """Exception raised when a parsing match is not found"""
 
     def __init__(self, regex: str, string: str):
@@ -15,3 +19,7 @@ def __init__(self, regex: str, string: str):
 
 class RegistryError(BaseError):
     """Exception raised when a registry error occurs"""
+
+
+class EncoderError(BaseError):
+    """Exception raised when an encoder error occurs"""
diff --git a/qcparse/main.py b/qcparse/main.py
@@ -6,14 +6,14 @@
 from pathlib import Path
 from typing import List, Union
 
-from qcio import SinglePointResults
+from qcio import ProgramInput, SinglePointResults
 
-from .exceptions import MatchNotFoundError
-from .models import ParserSpec, registry, single_point_results_namespace
+from .exceptions import EncoderError, MatchNotFoundError, ParserError
+from .models import NativeInput, ParserSpec, registry, single_point_results_namespace
 from .parsers import *  # noqa: F403 Ensure all parsers get registered
 from .utils import get_file_contents
 
-__all__ = ["parse", "parse_results", "registry"]
+__all__ = ["parse", "parse_results", "encode", "registry"]
 
 
 def parse(
@@ -40,36 +40,36 @@ def parse(
         A SinglePointResults object containing the parsed data.
 
     Raises:
+        ParserError: If no parsers are registered for the filetype of the program.
         MatchNotFoundError: If a required parser fails to parse its data.
     """
-    file_content, _ = get_file_contents(data_or_path)
+    parsers = import_module(f"qcparse.parsers.{program}")
 
-    # Create a SinglePointResult namespace object to collect the parsed data
-    spr_namespace = single_point_results_namespace()
+    # Check that filetype is supported by the program's parsers
+    if filetype not in parsers.SUPPORTED_FILETYPES:
+        raise ParserError(f"filetype '{filetype}' not supported by {program} parsers.")
+
+    file_content = get_file_contents(data_or_path)
 
     # Get the calctype if filetype is 'stdout'
     if filetype == "stdout":
-        parse_calctype = import_module(f"qcparse.parsers.{program}").parse_calctype
-        calctype = parse_calctype(file_content)
+        calctype = parsers.parse_calctype(file_content)
 
-    else:
-        calctype = None
-
-    # Get all the parsers for the program and filetype
+    # Get all the parsers for the program, filetype, and calctype
     parser_specs: List[ParserSpec] = registry.get_parsers(program, filetype, calctype)
 
+    # Create a SinglePointResult namespace object to collect the parsed data
+    data_collector = single_point_results_namespace()
+
     # Apply parsers to the file content.
     for ps in parser_specs:
         try:
-            # This will raise a MatchNotFound error if the parser can't find its data
-            ps.parser(file_content, spr_namespace)
-        except MatchNotFoundError:
+            ps.parser(file_content, data_collector)
+        except MatchNotFoundError:  # Raised if the parser can't find its data
             if ps.required:
                 raise
-            else:  # Parser didn't find anything, but it wasn't required
-                pass
 
-    return SinglePointResults(**spr_namespace.dict())
+    return SinglePointResults(**data_collector.dict())
 
 
 @functools.wraps(parse)
@@ -81,3 +81,24 @@ def parse_results(*args, **kwargs):
         stacklevel=2,
     )
     return parse(*args, **kwargs)
+
+
+def encode(inp_data: ProgramInput, program: str) -> NativeInput:
+    """Encode a ProgramInput object to a NativeInput object.
+
+    Args:
+        inp_data: The ProgramInput object to encode.
+        program: The program for which to encode the input.
+
+    Returns:
+        A NativeInput object with the encoded input.
+
+    Raises:
+        EncoderError: If the calctype is not supported by the program's encoder.
+    """
+    # Check that calctype is supported by the encoder
+    encoder = import_module(f"qcparse.encoders.{program}")
+    if inp_data.calctype not in encoder.SUPPORTED_CALCTYPES:
+        raise EncoderError(f"Calctype '{inp_data.calctype}' not supported by encoder.")
+
+    return encoder.encode(inp_data)
diff --git a/qcparse/models.py b/qcparse/models.py
@@ -1,10 +1,11 @@
 """Simple data models to support parsing of QM program output files."""
 
 from collections import defaultdict
+from enum import Enum
 from types import SimpleNamespace
 from typing import Callable, Dict, List, Optional
 
-from pydantic import BaseModel
+from pydantic import BaseModel, model_validator
 from qcio import CalcType
 
 from .exceptions import RegistryError
@@ -150,3 +151,34 @@ def single_point_results_namespace() -> ParsedDataCollector:
     output_obj.extras = ParsedDataCollector()
 
     return output_obj
+
+
+class FileType(str, Enum):
+    """Enum of filetypes that program parsers may support."""
+
+    stdout = "stdout"
+
+
+class NativeInput(BaseModel):
+    """Native input file data. Writing these files to disk should produce a valid input.
+
+    Attributes:
+        input_file: input file for the program
+        geometry_file: xyz file or other geometry file required for the calculation
+        geometry_filename: filename of the geometry file referenced in the input
+    """
+
+    input_file: str
+    geometry_file: Optional[str] = None
+    geometry_filename: Optional[str] = None
+
+    @model_validator(mode="after")
+    def ensure_geometry_filename(self):
+        """Ensure that geometry_filename is set if geometry_file is set."""
+        if self.geometry_file and not self.geometry_filename:
+            raise ValueError(
+                "geometry_filename must be set if geometry is set. "
+                "Set geometry_filename to the name of the geometry file as referenced "
+                "in the input file."
+            )
+        return self
diff --git a/qcparse/parsers/terachem.py b/qcparse/parsers/terachem.py
@@ -1,20 +1,15 @@
 """Parsers for TeraChem output files."""
 
 import re
-from enum import Enum
 
 from qcio import CalcType
 
 from qcparse.exceptions import MatchNotFoundError
-from qcparse.models import ParsedDataCollector
+from qcparse.models import FileType, ParsedDataCollector
 
 from .utils import parser, regex_search
 
-
-class FileType(str, Enum):
-    """Enum of supported TeraChem filetypes."""
-
-    stdout = "stdout"
+SUPPORTED_FILETYPES = {FileType.stdout}
 
 
 def parse_calctype(string: str) -> CalcType:
@@ -31,7 +26,7 @@ def parse_calctype(string: str) -> CalcType:
     raise MatchNotFoundError(regex, string)
 
 
-@parser(filetype=FileType.stdout)
+@parser()
 def parse_energy(string: str, data_collector: ParsedDataCollector):
     """Parse the final energy from TeraChem stdout.
 
@@ -43,7 +38,7 @@ def parse_energy(string: str, data_collector: ParsedDataCollector):
     data_collector.energy = float(regex_search(regex, string).group(1))
 
 
-@parser(filetype=FileType.stdout, only=[CalcType.gradient, CalcType.hessian])
+@parser(only=[CalcType.gradient, CalcType.hessian])
 def parse_gradient(string: str, data_collector: ParsedDataCollector):
     """Parse gradient from TeraChem stdout."""
     # This will match all floats after the dE/dX dE/dY dE/dZ header and stop at the
@@ -62,7 +57,7 @@ def parse_gradient(string: str, data_collector: ParsedDataCollector):
     data_collector.gradient = gradient
 
 
-@parser(filetype=FileType.stdout, only=[CalcType.hessian])
+@parser(only=[CalcType.hessian])
 def parse_hessian(string: str, data_collector: ParsedDataCollector):
     """Parse Hessian Matrix from TeraChem stdout
 
@@ -102,14 +97,14 @@ def parse_hessian(string: str, data_collector: ParsedDataCollector):
     data_collector.hessian = hessian
 
 
-@parser(filetype=FileType.stdout)
+@parser()
 def parse_natoms(string: str, data_collector: ParsedDataCollector):
     """Parse number of atoms value from TeraChem stdout"""
     regex = r"Total atoms:\s*(\d+)"
     data_collector.calcinfo_natoms = int(regex_search(regex, string).group(1))
 
 
-@parser(filetype=FileType.stdout)
+@parser()
 def parse_nmo(string: str, data_collector: ParsedDataCollector):
     """Parse the number of molecular orbitals TeraChem stdout"""
     regex = r"Total orbitals:\s*(\d+)"
@@ -136,7 +131,7 @@ def parse_version_string(string: str) -> str:
     return f"{parse_terachem_version(string)} [{parse_git_commit(string)}]"
 
 
-@parser(filetype=FileType.stdout)
+@parser()
 def parse_version(string: str, data_collector: ParsedDataCollector):
     """Parse TeraChem version from TeraChem stdout."""
     data_collector.extras.program_version = parse_version_string(string)