From f7586453c42199297aa71e53e1872a8a4566b681 Mon Sep 17 00:00:00 2001 From: tigranfah Date: Thu, 1 Aug 2024 13:09:08 +0400 Subject: [PATCH 1/2] add links to evaluation repo --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8810e77..48e8542 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ Instructions coming soon... ### Molecular Optimization 🎯 Running the Optimization Algorithm requires two steps: -**Step 1.** Define the Oracle, which is responsible to evaluate the oracle score for the given molecules. Below is presented the Oracle implementation scheme. +**Step 1.** Define the Oracle, which is responsible for evaluating the oracle scores for the given molecules. The Oracle implementation scheme is presented below. ```python class ExampleOracle: @@ -97,7 +97,7 @@ rej_sample_config: ... fine tuning hyperparameters ... ``` -Calling the **optimize** function. +Call the **optimize** function. ```python from chemlactica.mol_opt.optimization import optimize @@ -119,7 +119,7 @@ optimize( ) ``` -Refer to [example_run.py](https://github.com/YerevaNN/ChemLactica/blob/main/chemlactica/mol_opt/example_run.py) for a full working example of an optimization run. For more complex examples refer to the [ChemlacticaTestSuit]() repository [mol_opt]() and [retmol]() directories. +Refer to [example_run.py](https://github.com/YerevaNN/ChemLactica/blob/main/chemlactica/mol_opt/example_run.py) for a full working example of an optimization run. For more complex examples refer to the [ChemLacticaTestSuite](https://github.com/YerevaNN/ChemLacticaTestSuite) repository [mol_opt](https://github.com/YerevaNN/ChemLacticaTestSuite/tree/master/mol_opt) and [retmol](https://github.com/YerevaNN/ChemLacticaTestSuite/tree/master/retmol) directories. 
## Tests The test for running the a small sized model with the same From 0f75cc088961b75b521af0a8590f2132f843b1fa Mon Sep 17 00:00:00 2001 From: tigranfah Date: Wed, 14 Aug 2024 16:48:28 +0400 Subject: [PATCH 2/2] add validating function for smiles --- .../mol_opt/chemlactica_125m_hparams.yaml | 4 ++-- chemlactica/mol_opt/example_run.py | 20 +++++++++---------- chemlactica/mol_opt/optimization.py | 7 +++++-- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/chemlactica/mol_opt/chemlactica_125m_hparams.yaml b/chemlactica/mol_opt/chemlactica_125m_hparams.yaml index c08cccb..b990672 100644 --- a/chemlactica/mol_opt/chemlactica_125m_hparams.yaml +++ b/chemlactica/mol_opt/chemlactica_125m_hparams.yaml @@ -1,5 +1,5 @@ -checkpoint_path: /path/to/model_dir -tokenizer_path: /path/to/tokenizer_dir +checkpoint_path: yerevann/chemlactica-125m +tokenizer_path: yerevann/chemlactica-125m pool_size: 10 validation_perc: 0.2 num_mols: 0 diff --git a/chemlactica/mol_opt/example_run.py b/chemlactica/mol_opt/example_run.py index fc49466..f31e35e 100644 --- a/chemlactica/mol_opt/example_run.py +++ b/chemlactica/mol_opt/example_run.py @@ -22,7 +22,7 @@ def __init__(self, max_oracle_calls: int): self.mol_buffer = {} # the maximum possible oracle score or an upper bound - self.max_possible_oracle_score = 1.0 + self.max_possible_oracle_score = 800 # if True the __call__ function takes list of MoleculeEntry objects # if False (or unspecified) the __call__ function takes list of SMILES strings @@ -39,16 +39,14 @@ def __call__(self, molecules: List[MoleculeEntry]): else: try: tpsa = rdMolDescriptors.CalcTPSA(molecule.mol) - tpsa_score = min(tpsa / 1000, 1) + oracle_score = tpsa weight = rdMolDescriptors.CalcExactMolWt(molecule.mol) - if weight <= 349: - weight_score = 1 - elif weight >= 500: - weight_score = 0 - else: - weight_score = -0.00662 * weight + 3.31125 - - oracle_score = (tpsa_score + weight_score) / 3 + num_rings = rdMolDescriptors.CalcNumRings(molecule.mol) + if 
weight >= 350: + oracle_score = 0 + if num_rings < 2: + oracle_score = 0 + except Exception as e: print(e) oracle_score = 0 @@ -105,7 +103,7 @@ def parse_arguments(): for i in range(args.n_runs): set_seed(seeds[i]) oracle = TPSA_Weight_Oracle(max_oracle_calls=1000) - config["log_dir"] = os.path.join(args.output_dir, "results_tpsa+weight+num_rungs.log") + config["log_dir"] = os.path.join(args.output_dir, f"results_chemlactica_tpsa+weight+num_rungs_{seeds[i]}.log") config["max_possible_oracle_score"] = oracle.max_possible_oracle_score optimize( model, tokenizer, diff --git a/chemlactica/mol_opt/optimization.py b/chemlactica/mol_opt/optimization.py index 9cb2897..fb59d1b 100644 --- a/chemlactica/mol_opt/optimization.py +++ b/chemlactica/mol_opt/optimization.py @@ -36,13 +36,15 @@ def create_optimization_entries(num_entries, pool, config): return optim_entries -def create_molecule_entry(output_text): +def create_molecule_entry(output_text, validate_smiles): start_smiles_tag, end_smiles_tag = "[START_SMILES]", "[END_SMILES]" start_ind = output_text.rfind(start_smiles_tag) end_ind = output_text.rfind(end_smiles_tag) if start_ind == -1 or end_ind == -1: return None generated_smiles = output_text[start_ind+len(start_smiles_tag):end_ind] + if not validate_smiles(generated_smiles): + return None if len(generated_smiles) == 0: return None @@ -58,7 +60,8 @@ def create_molecule_entry(output_text): def optimize( model, tokenizer, oracle, config, - additional_properties={} + additional_properties={}, + validate_smiles=lambda x:True ): file = open(config["log_dir"], "w") print("config", config)