Skip to content

Commit

Permalink
Canonicalize SMILES code (#507)
Browse files Browse the repository at this point in the history
Use RDKit to canonicalize the SMILES code given by the user.
When the canonical SMILES differs from the input, print it.
Apparently, some SMILES codes failed to generate a conformer when they were not canonicalized.
Closes #505 and #331
  • Loading branch information
danielhollas committed Sep 6, 2023
1 parent 79a234d commit c8c234e
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 2 deletions.
33 changes: 31 additions & 2 deletions aiidalab_widgets_base/structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -772,7 +772,10 @@ def _rdkit_opt(self, smiles, steps):
return None
mol = Chem.AddHs(mol)

AllChem.EmbedMolecule(mol, maxAttempts=20, randomSeed=42)
conf_id = AllChem.EmbedMolecule(mol, maxAttempts=20, randomSeed=42)
if conf_id < 0:
self.output.value = "RDKit ERROR: Could not generate conformer"
return None
if AllChem.UFFHasAllMoleculeParams(mol):
AllChem.UFFOptimizeMolecule(mol, maxIters=steps)
else:
Expand All @@ -785,8 +788,18 @@ def _rdkit_opt(self, smiles, steps):

def _mol_from_smiles(self, smiles, steps=1000):
"""Convert SMILES to ase structure try rdkit then pybel"""

# Canonicalize the SMILES code
# https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system#Terminology
canonical_smiles = self.canonicalize_smiles(smiles)
if not canonical_smiles:
return None

if canonical_smiles != smiles:
self.output.value = f"Canonical SMILES: {canonical_smiles}"

try:
return self._rdkit_opt(smiles, steps)
return self._rdkit_opt(canonical_smiles, steps)
except ValueError as e:
self.output.value = str(e)
if self.disable_openbabel:
Expand All @@ -802,11 +815,27 @@ def _on_button_pressed(self, change=None):
return
spinner = f"Screening possible conformers {self.SPINNER}" # font-size:20em;
self.output.value = spinner

self.structure = self._mol_from_smiles(self.smiles.value)
# Don't overwrite possible error/warning messages
if self.output.value == spinner:
self.output.value = ""

def canonicalize_smiles(self, smiles):
    """Return the canonical form of a SMILES code as produced by RDKit.

    :param smiles: SMILES code to canonicalize.
    :return: the canonical SMILES string, or ``None`` when RDKit cannot
        parse the input or yields an empty result. In both failure cases
        an error message is written to ``self.output.value``.
    """
    from rdkit import Chem

    parsed = Chem.MolFromSmiles(smiles, sanitize=True)
    if parsed is None:
        # RDKit returned no molecule for this input, so there is nothing
        # to canonicalize; report the problem and give up right away.
        self.output.value = "RDkit ERROR: Invalid SMILES string"
        return None

    result = Chem.MolToSmiles(parsed, isomericSmiles=True, canonical=True)
    if result:
        return result

    # An empty result is treated as a failure as well.
    self.output.value = "RDkit ERROR: Could not canonicalize SMILES"
    return None

@tl.default("structure")
def _default_structure(self):
return None
Expand Down
21 changes: 21 additions & 0 deletions tests/test_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,27 @@ def test_smiles_widget():
assert widget.structure.get_chemical_formula() == "N2"


@pytest.mark.usefixtures("aiida_profile_clean")
def test_smiles_canonicalization():
    """Check that `SmilesWidget.canonicalize_smiles` canonicalizes via RDKit."""
    smiles_widget = awb.SmilesWidget()

    # A code that is already canonical must come back unchanged.
    assert smiles_widget.canonicalize_smiles("C") == "C"

    # A non-canonical code must be rewritten to its canonical form ...
    canonical_form = smiles_widget.canonicalize_smiles("O=CC=C")
    assert canonical_form == "C=CC=O"

    # ... and canonicalizing that result a second time must be a no-op.
    assert canonical_form == smiles_widget.canonicalize_smiles(canonical_form)

    # Regression test for https://github.com/aiidalab/aiidalab-widgets-base/issues/505
    # Feeding this non-canonical string directly to `_rdkit_opt` should not raise.
    issue_505_smiles = "C=CC1=C(C2=CC=C(C3=CC=CC=C3)C=C2)C=C(C=C)C(C4=CC=C(C(C=C5)=CC=C5C(C=C6C=C)=C(C=C)C=C6C7=CC=C(C(C=C8)=CC=C8C(C=C9C=C)=C(C=C)C=C9C%10=CC=CC=C%10)C=C7)C=C4)=C1"
    smiles_widget._rdkit_opt(issue_505_smiles, steps=1)


@pytest.mark.usefixtures("aiida_profile_clean")
def test_basic_cell_editor_widget(structure_data_object):
"""Test the `BasicCellEditor`."""
Expand Down

0 comments on commit c8c234e

Please sign in to comment.