From ed218476c67984ef19bb396c92b0d271e455087b Mon Sep 17 00:00:00 2001 From: Patrick Armstrong Date: Tue, 18 Jun 2024 08:36:10 -0500 Subject: [PATCH 1/2] Scone Refactor --- pippin/classifiers/classifier.py | 4 +- pippin/classifiers/scone.py | 378 ++++++++++---------- pippin/classifiers/scone_legacy.py | 297 +++++++++++++++ tests/config_files/cfg_dev.yml | 5 + tests/config_files/valid_classify_scone.yml | 48 +++ tests/test_valid_config.py | 54 +++ 6 files changed, 600 insertions(+), 186 deletions(-) create mode 100644 pippin/classifiers/scone_legacy.py create mode 100644 tests/config_files/valid_classify_scone.yml diff --git a/pippin/classifiers/classifier.py b/pippin/classifiers/classifier.py index f4498b40..d06206c3 100644 --- a/pippin/classifiers/classifier.py +++ b/pippin/classifiers/classifier.py @@ -363,7 +363,7 @@ def get_num_ranseed(sim_tasks, lcfit_tasks): # deps.append(t) extra = t.get_unique_name() - assert t.__class__ == cls, f"Model {clas_name} with class {cls} has model {model} with class {t.__class__}, they should match!" + assert isinstance(t, cls), f"Model {clas_name} with class {cls} has model {model} with class {t.__class__}, they should match!" indexes = get_num_ranseed(sim_deps, fit_deps) for i in range(indexes): @@ -381,8 +381,6 @@ def get_num_ranseed(sim_tasks, lcfit_tasks): for i in range(indexes): num = i + 1 if indexes > 1 else None clas_output_dir = _get_clas_output_dir(base_output_dir, stage_number, sim_name, fit_name, clas_name, index=num) - print(clas_output_dir) - print(deps) cc = cls(clas_name, clas_output_dir, config, deps, mode, options, index=i) Task.logger.info( f"Creating classification task {name} with {cc.num_jobs} jobs, for LC fit {fit_name} on simulation {sim_name} and index {i}" diff --git a/pippin/classifiers/scone.py b/pippin/classifiers/scone.py index 23c95b82..324ee99a 100644 --- a/pippin/classifiers/scone.py +++ b/pippin/classifiers/scone.py @@ -1,16 +1,28 @@ -import shutil -import subprocess +# Created Mar 2024 by R.Kessler and H.Qu +# Refactor pippin interface to scone to accept and modify +# a scone-input file. 
+ +import os, sys, shutil, subprocess, yaml, re, time from pathlib import Path -import yaml import pandas as pd -import re import numpy as np -import time from pippin.classifiers.classifier import Classifier from pippin.config import get_config, get_output_loc, mkdirs, get_data_loc, merge_dict from pippin.task import Task + +# ========================================= + +SCONE_SHELL_SCRIPT = "run_refactor.py" # top-level script under $SCONE_DIR + +KEYLIST_SCONE_INPUT = [ 'init_env_train', 'init_env_heatmaps', + 'prescale_heatmaps', 'nevt_select_heatmaps', + 'batch_size', 'categorical', 'class_balanced', + 'num_epochs', 'num_mjd_bins', 'num_wavelength_bins', + 'mode', 'trained_model', 'prob_column_name' ] + +# ========================================== class SconeClassifier(Classifier): """ convolutional neural network-based SN photometric classifier for details, see https://arxiv.org/abs/2106.04370, https://arxiv.org/abs/2111.05539, https://arxiv.org/abs/2207.09440 @@ -33,7 +45,8 @@ class SconeClassifier(Classifier): NUM_EPOCHS: 400 IA_FRACTION: 0.5 MODEL: /path/to/trained/model - SIM_FRACTION: 1 # fraction of sims to use for training + SIM_FRACTION: 1 # fraction of sims to use for training (to be obsolete) + PRESCALE_HEATMAPS: 1 # divide sample by PRESCALE for heatmag and training SCONE_CPU_BATCH_FILE: /path/to/sbatch/template/for/scone SCONE_GPU_BATCH_FILE: /path/to/sbatch/template/for/scone BATCH_REPLACE: {} @@ -45,99 +58,57 @@ class SconeClassifier(Classifier): """ + def __new__(cls, name, output_dir, config, dependencies, mode, options, index=0, model_name=None): + # XXX DEPRECATION + # If no BASE file is present, run legacy version of Scone + # Avoid recursive nonsense by making sure the type of `cls` is SconeClassifier + if cls == SconeClassifier and config.get('BASE') is None: + # Have to import later because SconeClassifier must exist prior to importing SconeLegacyClassifier + from pippin.classifiers.scone_legacy import SconeLegacyClassifier + cls = SconeLegacyClassifier + return super().__new__(cls) + def __init__(self, name, output_dir, config, dependencies, mode, options, index=0, model_name=None): super().__init__(name, output_dir, config, dependencies, mode, options, index=index, model_name=model_name) self.global_config = get_config() self.options = options - self.gpu = self.options.get("GPU", False) + # - - - - - - - + # special checks to help users cope with some changes + if mode == 'predict' and 'MODEL' in options: + self.options['TRAINED_MODEL'] = self.options['MODEL'] + + self.gpu = self.options.get("GPU", False) self.init_env_heatmaps = self.global_config["SCONE"]["init_env_cpu"] - self.init_env = self.global_config["SCONE"]["init_env_cpu"] if not self.gpu else self.global_config["SCONE"]["init_env_gpu"] + self.init_env = self.global_config["SCONE"]["init_env_cpu"] if not self.gpu else self.global_config["SCONE"]["init_env_gpu"] self.path_to_classifier = self.global_config["SCONE"]["location"] + self.combine_mask = "COMBINE_MASK" in config - output_path_obj = Path(self.output_dir) + self.select_lcfit = self.options.get("OPTIONAL_MASK_FIT", None) # RK May 3 2024 + scone_input_file = config.get('BASE') # refactor by passing scone input file to pippin + if scone_input_file is not None: + scone_input_file = get_data_loc(scone_input_file) + self.scone_input_file = scone_input_file + + output_path_obj = Path(self.output_dir) heatmaps_path_obj = output_path_obj / "heatmaps" + self.job_base_name = output_path_obj.parents[1].name + "__" + output_path_obj.name - 
self.batch_replace = self.options.get("BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {})) - self.slurm = """{sbatch_header} - {task_setup}""" + self.batch_replace = self.options.get("BATCH_REPLACE", + self.global_config.get("BATCH_REPLACE", {})) - self.config_path = str(output_path_obj / "model_config.yml") - self.logfile = str(output_path_obj / "output.log") - self.model_sbatch_job_path = str(output_path_obj / "job.slurm") - - self.heatmaps_path = str(heatmaps_path_obj) self.heatmaps_done_file = str(heatmaps_path_obj / "done.txt") - self.heatmaps_sbatch_header_path = str(heatmaps_path_obj / "sbatch_header.sh") - self.heatmaps_log_path = str(heatmaps_path_obj / f"create_heatmaps__{Path(self.config_path).name.split('.')[0]}.log") remake_heatmaps = self.options.get("REMAKE_HEATMAPS", False) self.keep_heatmaps = not remake_heatmaps - def make_sbatch_header(self, option_name, header_dict, use_gpu=False): - sbatch_header_template = self.options.get(option_name) - sbatch_header = self.sbatch_gpu_header if use_gpu else self.sbatch_cpu_header - - if sbatch_header_template is not None: - self.logger.debug(f"batch file found at {sbatch_header_template}") - with open(get_data_loc(sbatch_header_template), 'r') as f: - sbatch_header = f.read() - - sbatch_header = self.clean_header(sbatch_header) - - header_dict = merge_dict(header_dict, self.batch_replace) - return self._update_header(sbatch_header, header_dict) - - def make_heatmaps_sbatch_header(self): - self.logger.info("heatmaps not created, creating now") - shutil.rmtree(self.output_dir, ignore_errors=True) - mkdirs(self.heatmaps_path) - - # TODO: if externally specified batchfile exists, have to parse desired logfile path from it - header_dict = { - "REPLACE_LOGFILE": self.heatmaps_log_path, - "REPLACE_WALLTIME": "12:00:00", #TODO: change to scale with # of heatmaps expected - "REPLACE_MEM": self.options.get("HEATMAPS_MEM", "32GB"), - } - heatmaps_sbatch_header = self.make_sbatch_header("HEATMAPS_BATCH_FILE", header_dict) - - with open(self.heatmaps_sbatch_header_path, "w+") as f: - f.write(heatmaps_sbatch_header) - - def make_model_sbatch_script(self): - header_dict = { - "REPLACE_NAME": self.job_base_name, - "REPLACE_LOGFILE": str(Path(self.output_dir) / "output.log"), - "REPLACE_MEM": self.options.get("MODEL_MEM", "64GB"), - "REPLACE_WALLTIME": "4:00:00" if self.gpu else "12:00:00", # 4h is max for gpu - } - model_sbatch_header = self.make_sbatch_header("MODEL_BATCH_FILE", header_dict, use_gpu=self.gpu) - - setup_dict = { - "init_env": self.init_env, - "path_to_classifier": self.path_to_classifier, - "heatmaps_path": self.heatmaps_path, - "config_path": self.config_path, - "done_file": self.done_file, - } - - format_dict = { - "sbatch_header": model_sbatch_header, - "task_setup": self.update_setup(setup_dict, self.task_setup['scone']) - } - - self.logger.info(f"Running SCONE model, slurm job written to {self.model_sbatch_job_path}") - slurm_script = self.slurm.format(**format_dict) - - with open(self.model_sbatch_job_path, "w") as f: - f.write(slurm_script) - - return slurm_script + return def classify(self, mode): + self.logger.info(f"============ Prepare refactored SCONE with mode = {mode} =============") failed = False if Path(self.done_file).exists(): self.logger.debug(f"Found done file at {self.done_file}") @@ -145,119 +116,182 @@ def classify(self, mode): if "SUCCESS" not in f.read().upper(): failed = True - heatmaps_created = self._heatmap_creation_success() and self.keep_heatmaps + scone_input_file = self.scone_input_file + # 
- - - - sim_deps = self.get_simulation_dependency() sim_dirs = [sim_dep.output["photometry_dirs"][self.index] for sim_dep in sim_deps] - lcdata_paths = [path for path in self._get_lcdata_paths(sim_dirs) if "PHOT.FITS" in path] - metadata_paths = [path.replace("PHOT.FITS", "HEAD.FITS") for path in lcdata_paths] + # prepare scone input lines needed to create hash, + # but don't create scone input file yet. + scone_input_lines = self.prepare_scone_input_lines(sim_dirs,mode) - str_config = self._make_config(metadata_paths, lcdata_paths, mode, heatmaps_created) - new_hash = self.get_hash_from_string(str_config) + str_config = ' '.join(scone_input_lines) + new_hash = self.get_hash_from_string(str_config) if self._check_regenerate(new_hash) or failed: - self.logger.debug("Regenerating") + self.logger.debug("Regenerating scone") else: - self.logger.info("Hash check passed, not rerunning") + self.logger.info("scone hash check passed, not rerunning") self.should_be_done() return True - if not heatmaps_created: - # this deletes the whole directory tree, don't write anything before this - self.make_heatmaps_sbatch_header() - + # later, perhaps check to preserve heatmaps ?? + if os.path.exists(self.output_dir): + shutil.rmtree(self.output_dir) + os.makedirs(self.output_dir) + + # write scone input file, and beware that name of scone + # input file is updated + scone_input_base = os.path.basename(self.scone_input_file) + self.scone_input_file = self.output_dir + '/' + 'PIP_' + scone_input_base + with open(self.scone_input_file,"wt") as i: + for line in scone_input_lines: + i.write(f"{line}\n") + self.save_new_hash(new_hash) - with open(self.config_path, "w+") as cfgfile: - cfgfile.write(str_config) - - slurm_script = self.make_model_sbatch_script() - - # TODO: nersc needs `module load esslurm` to sbatch gpu jobs, maybe make - # this shell command to a file so diff systems can define their own - file_to_run = 'run.py' - if self.options.get("REFACTORED", False): - file_to_run = 'run_refactor.py' - elif self.options.get("LEGACY", False): - file_to_run = 'run_legacy.py' - path = Path(self.path_to_classifier) / file_to_run - path = path if path.exists() else Path(self.path_to_classifier) / 'run.py' - cmd = f"python {str(path)} --config_path {self.config_path}" - subprocess.run([cmd], shell=True) + + path = Path(self.path_to_classifier) / SCONE_SHELL_SCRIPT + path = path if path.exists() else Path(self.path_to_classifier) / SCONE_SHELL_SCRIPT + cmd = f"python {str(path)} " \ + f"--config_path {self.scone_input_file} " + # f"--sbatch_job_name {self.job_base_name} " + self.logger.info(f"Running command: {cmd}") + subprocess.run([cmd], shell=True) return True + def prepare_scone_input_lines(self, sim_dirs, mode ): + + # Created Apr 2024 by R.Kessler + # Read base scone input and make a few modification such as + # the sim data dirs, and other substitutions defined in pippin input. + # Method returns list of lines for modified scone-config input file. + # Original comments and input layout are preserved. 
+ + config_lines = [] + scone_input_file = self.scone_input_file + options_local = self.options.copy() # make local copy + + # set local mode as if it were an override key in pippin input file + options_local['MODE'] = mode + + if mode == 'predict' : + options_local['PROB_COLUMN_NAME'] = self.get_prob_column_name() + + # - - - - + flag_remove_line = False + + with open(scone_input_file, 'r') as i: + inp_config = i.read().split('\n') + + key_replace_dict = {} + key_remove_list = [ 'input_data_paths:' , 'snid_select_files:', + 'sbatch_job_name:' ] + + for line_in in inp_config: + line_out = line_in + wdlist = line_in.split() + nwd = len(wdlist) + if nwd == 0 : + flag_remove_line = False + else: + if wdlist[0] == 'output_path:' : + line_out = line_in.replace(wdlist[1],self.output_dir) + + # goofy logic to remove original input_data_paths + if flag_remove_line and wdlist[0] != '-' : + flag_remove_line = False + if wdlist[0] in key_remove_list: + flag_remove_line = True + + # check all possible scone keys that can be overwritten/added + for key in KEYLIST_SCONE_INPUT: + if wdlist[0] == key + ':' : + key_pippin = key.upper() + if key_pippin in options_local: + key_replace_dict[key_pippin] = True + val_replace = options_local[key_pippin] + line_out = line_in.replace(wdlist[1],str(val_replace)) + + # remove prescale for predict mode + if mode == 'predict' and 'prescale' in wdlist[0]: + line_out = f"# WARNING: {wdlist[0]} removed for {mode} mode." + + + if not flag_remove_line : + config_lines.append(line_out) + + # - - - - - - - - - - + # add extra info from pippin + config_lines.append(f"") + config_lines.append(f"# ======================================= ") + config_lines.append(f"# keys added by pippin\n ") + + # pass sbatch_job_name via config since there are other sbatch config + # keys already. Could also pass via command line arg --sbatch_job_name. 
+ config_lines.append(f"sbatch_job_name: {self.job_base_name}\n") + + config_lines.append(f"input_data_paths:") + for sim_dir in sim_dirs: + resolved_dir = os.path.realpath(sim_dir) + config_lines.append(f" - {resolved_dir}") + + # add pippin-specified keys that were not in the original scone input + for key_pippin in options_local: + key = key_pippin.lower() + if key_pippin not in key_replace_dict and key in KEYLIST_SCONE_INPUT: + val = options_local[key_pippin] + line = f"{key}: {val}" + config_lines.append(f"") + config_lines.append(f"{line}") + + # check option to select events passing LCFIT + + if self.select_lcfit: + config_lines.append(f'') + config_lines.append(f'# Train on events passing LCFIT') + config_lines.append('snid_select_files:') + lcfit_deps = self.get_fit_dependency() + #self.logger.info(f"\n xxx lcfit_deps = \n{lcfit_deps}\n") + for tmp_dict in lcfit_deps: + fitres_dir = tmp_dict['fitres_dirs'][self.index] + fitopt_base_file = tmp_dict['fitopt_map']['DEFAULT'] + fitres_file = f"{fitres_dir}/{fitopt_base_file}" + config_lines.append(f" - {fitres_file}") + + return config_lines + + + #def get_optional_requirements(config): + # # Created May 3 2024 by R.Kessler and P.Armstrong + # if config.get("SELECT_LCFIT", False): + # return False, True # wait for LCFIT task + # return False, False # no optional LCFIT task + def predict(self): return self.classify("predict") def train(self): return self.classify("train") - def _get_types(self): - types = {} - for t in self.get_simulation_dependency(): - for k, v in t.output['types'].items(): - if k not in types: - types[k] = v - return types - - def _make_config(self, metadata_paths, lcdata_paths, mode, heatmaps_created): - config = {} - - # environment configuration - config["init_env_heatmaps"] = self.init_env_heatmaps - config["init_env"] = self.init_env - - # info for heatmap creation - if not heatmaps_created: - config["sbatch_header_path"] = self.heatmaps_sbatch_header_path - - config["heatmaps_donefile"] = self.heatmaps_done_file - config["heatmaps_logfile"] = self.heatmaps_log_path - config["sim_fraction"] = self.options.get("SIM_FRACTION", 1) # 1/sim_fraction % of simulated SNe will be used for the model - config["heatmaps_path"] = self.heatmaps_path - config["model_sbatch_job_path"] = self.model_sbatch_job_path - config["num_wavelength_bins"] = self.options.get("NUM_WAVELENGTH_BINS", 32) - config["num_mjd_bins"] = self.options.get("NUM_MJD_BINS", 180) - config["metadata_paths"] = metadata_paths - config["lcdata_paths"] = lcdata_paths - - # info for classification model - config["categorical"] = self.options.get("CATEGORICAL", False) - config["num_epochs"] = self.options.get("NUM_EPOCHS", 400) # TODO: replace num epochs with autostop: stop training when slope plateaus? - config["batch_size"] = self.options.get("BATCH_SIZE", 32) # TODO: replace with percentage of total size? 
- config["Ia_fraction"] = self.options.get("IA_FRACTION", 0.5) - config["output_path"] = self.output_dir - config["trained_model"] = self.options.get("MODEL", None) - config["kcor_file"] = self.options.get("KCOR_FILE", None) - config["mode"] = mode - config["job_base_name"] = self.job_base_name - config["class_balanced"] = (mode == "train") - - types = self._get_types() - if types is not None: - types = {int(k): v for k, v in types.items()} # sometimes the keys are strings, sometimes ints - self.logger.info(f"input types from sim found, types set to {types}") - config["sn_type_id_to_name"] = types - - return yaml.dump(config) - def _check_completion(self, squeue): if Path(self.done_file).exists(): - self.logger.debug(f"Found done file at {self.done_file}") + self.logger.debug(f"Found scone done file at {self.done_file}") with open(self.done_file) as f: if "SUCCESS" not in f.read().upper(): return Task.FINISHED_FAILURE pred_path = str(Path(self.output_dir) / "predictions.csv") - predictions = pd.read_csv(pred_path) - if "pred_labels" in predictions.columns: - predictions = predictions[["snid", "pred_labels"]] # make sure snid is the first col - predictions = predictions.rename(columns={"pred_labels": self.get_prob_column_name()}) - predictions.to_csv(pred_path, index=False) - self.logger.info(f"Predictions file can be found at {pred_path}") + #predictions = pd.read_csv(pred_path) + #if "pred_labels" in predictions.columns: + # predictions = predictions[["snid", "pred_labels"]] # make sure snid is the first col + # predictions = predictions.rename(columns={"pred_labels": self.get_prob_column_name()}) + # predictions.to_csv(pred_path, index=False) + #self.logger.info(f"Predictions file can be found at {pred_path}") self.output.update({"model_filename": self.options.get("MODEL", str(Path(self.output_dir) / "trained_model")), "predictions_filename": pred_path}) + return Task.FINISHED_SUCCESS return self.check_for_job(squeue, self.job_base_name) @@ -269,28 +303,6 @@ def _heatmap_creation_success(self): return False return Path(self.heatmaps_path).exists() and (Path(self.heatmaps_path) / "done.log").exists() - def num_jobs_in_queue(self): - print("rerun num jobs in queue") - squeue = [i.strip() for i in subprocess.check_output(f"squeue -h -u $USER -o '%.200j'", shell=True, text=True).splitlines()] - self.logger.debug(f"{squeue}") - return self.check_for_job(squeue, self.job_base_name) - - @staticmethod - def _get_lcdata_paths(sim_dirs): - lcdata_paths = [str(f.resolve()) for sim_dir in sim_dirs for f in Path(sim_dir).iterdir() if "PHOT" in f.name] - return lcdata_paths - - @staticmethod - def _update_header(header, header_dict): - for key, value in header_dict.items(): - if key in header: - header = header.replace(key, str(value)) - append_list = header_dict.get("APPEND") - if append_list is not None: - lines = header.split('\n') - lines += append_list - header = '\n'.join(lines) - return header @staticmethod def get_requirements(options): diff --git a/pippin/classifiers/scone_legacy.py b/pippin/classifiers/scone_legacy.py new file mode 100644 index 00000000..0a2fe8c2 --- /dev/null +++ b/pippin/classifiers/scone_legacy.py @@ -0,0 +1,297 @@ +import shutil +import subprocess +from pathlib import Path +import yaml +import pandas as pd +import re +import numpy as np +import time + +from pippin.classifiers.scone import SconeClassifier +from pippin.config import get_config, get_output_loc, mkdirs, get_data_loc, merge_dict +from pippin.task import Task + +class SconeLegacyClassifier(SconeClassifier): + 
""" convolutional neural network-based SN photometric classifier + for details, see https://arxiv.org/abs/2106.04370, https://arxiv.org/abs/2111.05539, https://arxiv.org/abs/2207.09440 + + CONFIGURATION: + ============== + CLASSIFICATION: + label: + CLASSIFIER: SconeClassifier + MASK: TEST # partial match on sim and classifier + MASK_SIM: TEST # partial match on sim name + MASK_FIT: TEST # partial match on lcfit name + MODE: train/predict + OPTS: + GPU: True + CATEGORICAL: False + NUM_WAVELENGTH_BINS: 32 + NUM_MJD_BINS: 180 + REMAKE_HEATMAPS: False + NUM_EPOCHS: 400 + IA_FRACTION: 0.5 + MODEL: /path/to/trained/model + SIM_FRACTION: 1 # fraction of sims to use for training + SCONE_CPU_BATCH_FILE: /path/to/sbatch/template/for/scone + SCONE_GPU_BATCH_FILE: /path/to/sbatch/template/for/scone + BATCH_REPLACE: {} + + OUTPUTS: + ======== + predictions.csv: list of snids and associated predictions + training_history.csv: training history output from keras + + """ + + def __init__(self, name, output_dir, config, dependencies, mode, options, index=0, model_name=None): + super().__init__(name, output_dir, config, dependencies, mode, options, index=index, model_name=model_name) + self.logger.warning(f'Using Legacy Scone version, pass a Scone input file via `BASE: /path/to/input.yml` to use the latest Scone version.') + self.global_config = get_config() + self.options = options + + self.gpu = self.options.get("GPU", False) + self.init_env_heatmaps = self.global_config["SCONE"]["init_env_cpu"] + self.init_env = self.global_config["SCONE"]["init_env_cpu"] if not self.gpu else self.global_config["SCONE"]["init_env_gpu"] + self.path_to_classifier = self.global_config["SCONE"]["location"] + self.combine_mask = "COMBINE_MASK" in config + + output_path_obj = Path(self.output_dir) + heatmaps_path_obj = output_path_obj / "heatmaps" + + self.job_base_name = output_path_obj.parents[1].name + "__" + output_path_obj.name + + self.batch_replace = self.options.get("BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {})) + self.slurm = """{sbatch_header} + {task_setup}""" + + self.config_path = str(output_path_obj / "model_config.yml") + self.logfile = str(output_path_obj / "output.log") + self.model_sbatch_job_path = str(output_path_obj / "job.slurm") + + self.heatmaps_path = str(heatmaps_path_obj) + self.heatmaps_done_file = str(heatmaps_path_obj / "done.txt") + self.heatmaps_sbatch_header_path = str(heatmaps_path_obj / "sbatch_header.sh") + self.heatmaps_log_path = str(heatmaps_path_obj / f"create_heatmaps__{Path(self.config_path).name.split('.')[0]}.log") + + remake_heatmaps = self.options.get("REMAKE_HEATMAPS", False) + self.keep_heatmaps = not remake_heatmaps + + def make_sbatch_header(self, option_name, header_dict, use_gpu=False): + sbatch_header_template = self.options.get(option_name) + sbatch_header = self.sbatch_gpu_header if use_gpu else self.sbatch_cpu_header + + if sbatch_header_template is not None: + self.logger.debug(f"batch file found at {sbatch_header_template}") + with open(get_data_loc(sbatch_header_template), 'r') as f: + sbatch_header = f.read() + + sbatch_header = self.clean_header(sbatch_header) + + header_dict = merge_dict(header_dict, self.batch_replace) + return self._update_header(sbatch_header, header_dict) + + def make_heatmaps_sbatch_header(self): + self.logger.info("heatmaps not created, creating now") + shutil.rmtree(self.output_dir, ignore_errors=True) + mkdirs(self.heatmaps_path) + + # TODO: if externally specified batchfile exists, have to parse desired logfile path from it + 
header_dict = { + "REPLACE_LOGFILE": self.heatmaps_log_path, + "REPLACE_WALLTIME": "12:00:00", #TODO: change to scale with # of heatmaps expected + "REPLACE_MEM": self.options.get("HEATMAPS_MEM", "32GB"), + } + heatmaps_sbatch_header = self.make_sbatch_header("HEATMAPS_BATCH_FILE", header_dict) + + with open(self.heatmaps_sbatch_header_path, "w+") as f: + f.write(heatmaps_sbatch_header) + + def make_model_sbatch_script(self): + header_dict = { + "REPLACE_NAME": self.job_base_name, + "REPLACE_LOGFILE": str(Path(self.output_dir) / "output.log"), + "REPLACE_MEM": self.options.get("MODEL_MEM", "64GB"), + "REPLACE_WALLTIME": "4:00:00" if self.gpu else "12:00:00", # 4h is max for gpu + } + model_sbatch_header = self.make_sbatch_header("MODEL_BATCH_FILE", header_dict, use_gpu=self.gpu) + + setup_dict = { + "init_env": self.init_env, + "path_to_classifier": self.path_to_classifier, + "heatmaps_path": self.heatmaps_path, + "config_path": self.config_path, + "done_file": self.done_file, + } + + format_dict = { + "sbatch_header": model_sbatch_header, + "task_setup": self.update_setup(setup_dict, self.task_setup['scone']) + } + + self.logger.info(f"Running SCONE model, slurm job written to {self.model_sbatch_job_path}") + slurm_script = self.slurm.format(**format_dict) + + with open(self.model_sbatch_job_path, "w") as f: + f.write(slurm_script) + + return slurm_script + + def classify(self, mode): + failed = False + if Path(self.done_file).exists(): + self.logger.debug(f"Found done file at {self.done_file}") + with open(self.done_file) as f: + if "SUCCESS" not in f.read().upper(): + failed = True + + heatmaps_created = self._heatmap_creation_success() and self.keep_heatmaps + + sim_deps = self.get_simulation_dependency() + sim_dirs = [sim_dep.output["photometry_dirs"][self.index] for sim_dep in sim_deps] + + lcdata_paths = [path for path in self._get_lcdata_paths(sim_dirs) if "PHOT.FITS" in path] + metadata_paths = [path.replace("PHOT.FITS", "HEAD.FITS") for path in lcdata_paths] + + str_config = self._make_config(metadata_paths, lcdata_paths, mode, heatmaps_created) + new_hash = self.get_hash_from_string(str_config) + + if self._check_regenerate(new_hash) or failed: + self.logger.debug("Regenerating") + else: + self.logger.info("Hash check passed, not rerunning") + self.should_be_done() + return True + + if not heatmaps_created: + # this deletes the whole directory tree, don't write anything before this + self.make_heatmaps_sbatch_header() + + self.save_new_hash(new_hash) + with open(self.config_path, "w+") as cfgfile: + cfgfile.write(str_config) + + slurm_script = self.make_model_sbatch_script() + + # TODO: nersc needs `module load esslurm` to sbatch gpu jobs, maybe make + # this shell command to a file so diff systems can define their own + file_to_run = 'run.py' + if self.options.get("REFACTORED", False): + file_to_run = 'run_refactor.py' + elif self.options.get("LEGACY", False): + file_to_run = 'run_legacy.py' + path = Path(self.path_to_classifier) / file_to_run + path = path if path.exists() else Path(self.path_to_classifier) / 'run.py' + cmd = f"python {str(path)} --config_path {self.config_path}" + subprocess.run([cmd], shell=True) + self.logger.info(f"Running command: {cmd}") + + return True + + def predict(self): + return self.classify("predict") + + def train(self): + return self.classify("train") + + def _get_types(self): + types = {} + for t in self.get_simulation_dependency(): + for k, v in t.output['types'].items(): + if k not in types: + types[k] = v + return types + + def 
_make_config(self, metadata_paths, lcdata_paths, mode, heatmaps_created): + config = {} + + # environment configuration + config["init_env_heatmaps"] = self.init_env_heatmaps + config["init_env"] = self.init_env + + # info for heatmap creation + if not heatmaps_created: + config["sbatch_header_path"] = self.heatmaps_sbatch_header_path + + config["heatmaps_donefile"] = self.heatmaps_done_file + config["heatmaps_logfile"] = self.heatmaps_log_path + config["sim_fraction"] = self.options.get("SIM_FRACTION", 1) # 1/sim_fraction % of simulated SNe will be used for the model + config["heatmaps_path"] = self.heatmaps_path + config["model_sbatch_job_path"] = self.model_sbatch_job_path + config["num_wavelength_bins"] = self.options.get("NUM_WAVELENGTH_BINS", 32) + config["num_mjd_bins"] = self.options.get("NUM_MJD_BINS", 180) + config["metadata_paths"] = metadata_paths + config["lcdata_paths"] = lcdata_paths + + # info for classification model + config["categorical"] = self.options.get("CATEGORICAL", False) + config["num_epochs"] = self.options.get("NUM_EPOCHS", 400) # TODO: replace num epochs with autostop: stop training when slope plateaus? + config["batch_size"] = self.options.get("BATCH_SIZE", 32) # TODO: replace with percentage of total size? + config["Ia_fraction"] = self.options.get("IA_FRACTION", 0.5) + config["output_path"] = self.output_dir + config["trained_model"] = self.options.get("MODEL", None) + config["kcor_file"] = self.options.get("KCOR_FILE", None) + config["mode"] = mode + config["job_base_name"] = self.job_base_name + config["class_balanced"] = (mode == "train") + + types = self._get_types() + if types is not None: + types = {int(k): v for k, v in types.items()} # sometimes the keys are strings, sometimes ints + self.logger.info(f"input types from sim found, types set to {types}") + config["sn_type_id_to_name"] = types + + return yaml.dump(config) + + def _check_completion(self, squeue): + if Path(self.done_file).exists(): + self.logger.debug(f"Found done file at {self.done_file}") + with open(self.done_file) as f: + if "SUCCESS" not in f.read().upper(): + return Task.FINISHED_FAILURE + + pred_path = str(Path(self.output_dir) / "predictions.csv") + predictions = pd.read_csv(pred_path) + if "pred_labels" in predictions.columns: + predictions = predictions[["snid", "pred_labels"]] # make sure snid is the first col + predictions = predictions.rename(columns={"pred_labels": self.get_prob_column_name()}) + predictions.to_csv(pred_path, index=False) + self.logger.info(f"Predictions file can be found at {pred_path}") + self.output.update({"model_filename": self.options.get("MODEL", str(Path(self.output_dir) / "trained_model")), "predictions_filename": pred_path}) + return Task.FINISHED_SUCCESS + return self.check_for_job(squeue, self.job_base_name) + + def _heatmap_creation_success(self): + if not Path(self.heatmaps_done_file).exists(): + return False + with open(self.heatmaps_done_file, "r") as donefile: + if "CREATE HEATMAPS FAILURE" in donefile.read(): + return False + return Path(self.heatmaps_path).exists() and (Path(self.heatmaps_path) / "done.log").exists() + + def num_jobs_in_queue(self): + squeue = [i.strip() for i in subprocess.check_output(f"squeue -h -u $USER -o '%.200j'", shell=True, text=True).splitlines()] + self.logger.debug(f"{squeue}") + return self.check_for_job(squeue, self.job_base_name) + + @staticmethod + def _get_lcdata_paths(sim_dirs): + lcdata_paths = [str(f.resolve()) for sim_dir in sim_dirs for f in Path(sim_dir).iterdir() if "PHOT" in f.name] + return 
lcdata_paths + + @staticmethod + def _update_header(header, header_dict): + for key, value in header_dict.items(): + if key in header: + header = header.replace(key, str(value)) + append_list = header_dict.get("APPEND") + if append_list is not None: + lines = header.split('\n') + lines += append_list + header = '\n'.join(lines) + return header + + @staticmethod + def get_requirements(options): + return True, False diff --git a/tests/config_files/cfg_dev.yml b/tests/config_files/cfg_dev.yml index 6f4221da..99a4fe1a 100644 --- a/tests/config_files/cfg_dev.yml +++ b/tests/config_files/cfg_dev.yml @@ -26,6 +26,11 @@ DataSkimmer: conda_env: snn_gpu location: $PRODUCTS/utilities/dataskim +SCONE: + init_env_cpu: source activate scone_cpu_tf2.6 + init_env_gpu: source activate scone_gpu_tf2.6 + location: $SCONE_DIR + CosmoMC: location: $PRODUCTS/CosmoMC/v03/CosmoMC-master static_loc: cosmomc_static_chains diff --git a/tests/config_files/valid_classify_scone.yml b/tests/config_files/valid_classify_scone.yml new file mode 100644 index 00000000..cf8a4888 --- /dev/null +++ b/tests/config_files/valid_classify_scone.yml @@ -0,0 +1,48 @@ +SIM: + EXAMPLESIM: + IA_G10_DES3YR: + BASE: surveys/sdss/sims_ia/sn_ia_g10_sdss_3yr.input + II: + BASE: surveys/sdss/sims_cc/sn_ii_templates.input + Ibc: + BASE: surveys/sdss/sims_cc/sn_ibc_templates.input + GLOBAL: + NGEN_UNIT: 1 + RANSEED_REPEAT: 10 12345 + SOLID_ANGLE: 10 + +LCFIT: + D: + BASE: surveys/des/lcfit_nml/des_5yr.nml + +CLASSIFICATION: + + LEGACY_SCONE_TRAIN: + CLASSIFIER: SconeClassifier + MODE: train + OPTS: + OPTIONAL_MASK_FIT: "D" + NUM_EPOCHS: 400 + + LEGACY_SCONE_PREDICT: + CLASSIFIER: SconeClassifier + MODE: predict + OPTS: + OPTIONAL_MASK_FIT: "D" + MODEL: 'LEGACY_SCONE_TRAIN' + + SCONE_TRAIN: + CLASSIFIER: SconeClassifier + MODE: train + BASE: "/path/to/base/file" + OPTS: + OPTIONAL_MASK_FIT: "D" + NUM_EPOCHS: 400 + + SCONE_PREDICT: + CLASSIFIER: SconeClassifier + MODE: predict + BASE: "/path/to/base/file" + OPTS: + OPTIONAL_MASK_FIT: "D" + MODEL: 'SCONE_TRAIN' diff --git a/tests/test_valid_config.py b/tests/test_valid_config.py index 88033ab1..05bdaaf2 100644 --- a/tests/test_valid_config.py +++ b/tests/test_valid_config.py @@ -7,6 +7,8 @@ from pippin.snana_fit import SNANALightCurveFit from pippin.classifiers.fitprob import FitProbClassifier from pippin.classifiers.perfect import PerfectClassifier +from pippin.classifiers.scone import SconeClassifier +from pippin.classifiers.scone_legacy import SconeLegacyClassifier from pippin.aggregator import Aggregator from pippin.merge import Merger from pippin.biascor import BiasCor @@ -150,6 +152,58 @@ def test_classifier_sim_with_opt_lcfit_config_valid(): assert isinstance(deps[0], SNANASimulation) assert isinstance(deps[1], SNANALightCurveFit) +def test_classifier_scone_valid(): + manager = get_manager(yaml="tests/config_files/valid_classify_scone.yml", check=True) + tasks = manager.tasks + + # 1 Sim, 1 LCFit, 4 Scone + assert len(tasks) == 6 + assert isinstance(tasks[0], SNANASimulation) + assert isinstance(tasks[1], SNANALightCurveFit) + for task in tasks[2:]: + # isinstance => Class or Subclass + assert isinstance(task, SconeClassifier) + + tests = [ + { + 'task': tasks[2], + 'cls': SconeLegacyClassifier, + 'attr': { + 'name': 'LEGACY_SCONE_TRAIN', + 'scone_input_file': None + } + }, + { + 'task': tasks[3], + 'cls': SconeLegacyClassifier, + 'attr': { + 'name': 'LEGACY_SCONE_PREDICT', + 'scone_input_file': None + } + }, + { + 'task': tasks[4], + 'cls': SconeClassifier, + 'attr': { + 'name': 
'SCONE_TRAIN', + } + }, + { + 'task': tasks[5], + 'cls': SconeClassifier, + 'attr': { + 'name': 'SCONE_PREDICT', + } + } + ] + + for test in tests: + task = test['task'] + assert type(task) is test['cls'] + for (attr, val) in test['attr'].items(): + assert hasattr(task, attr) + assert getattr(task, attr) == val + def test_agg_config_valid(): # This shouldn't raise an error From 41487c2dc021e851a19bf23f4f6ee4a0dcec0137 Mon Sep 17 00:00:00 2001 From: Patrick Armstrong Date: Tue, 18 Jun 2024 20:05:45 -0500 Subject: [PATCH 2/2] Changed which run file each scone version uses --- pippin/classifiers/scone.py | 2 +- pippin/classifiers/scone_legacy.py | 7 +------ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/pippin/classifiers/scone.py b/pippin/classifiers/scone.py index 324ee99a..395e6d89 100644 --- a/pippin/classifiers/scone.py +++ b/pippin/classifiers/scone.py @@ -14,7 +14,7 @@ # ========================================= -SCONE_SHELL_SCRIPT = "run_refactor.py" # top-level script under $SCONE_DIR +SCONE_SHELL_SCRIPT = "run.py" # top-level script under $SCONE_DIR KEYLIST_SCONE_INPUT = [ 'init_env_train', 'init_env_heatmaps', 'prescale_heatmaps', 'nevt_select_heatmaps', diff --git a/pippin/classifiers/scone_legacy.py b/pippin/classifiers/scone_legacy.py index 0a2fe8c2..35d8a153 100644 --- a/pippin/classifiers/scone_legacy.py +++ b/pippin/classifiers/scone_legacy.py @@ -176,13 +176,8 @@ def classify(self, mode): # TODO: nersc needs `module load esslurm` to sbatch gpu jobs, maybe make # this shell command to a file so diff systems can define their own - file_to_run = 'run.py' - if self.options.get("REFACTORED", False): - file_to_run = 'run_refactor.py' - elif self.options.get("LEGACY", False): - file_to_run = 'run_legacy.py' + file_to_run = 'run_legacy.py' path = Path(self.path_to_classifier) / file_to_run - path = path if path.exists() else Path(self.path_to_classifier) / 'run.py' cmd = f"python {str(path)} --config_path {self.config_path}" subprocess.run([cmd], shell=True) self.logger.info(f"Running command: {cmd}")
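
Reviewer note (not part of the patch): the `__new__` override added to SconeClassifier is the subtle piece of this refactor, so below is a minimal, standalone sketch of the dispatch pattern it relies on. The class names `Scone` and `SconeLegacy` here are placeholders for illustration only, not the pippin classes, and the sketch omits the deferred import that the real code uses to avoid a circular dependency; it only shows why the `cls is <base class>` guard plus the `config.get('BASE')` check routes BASE-less configs to the legacy subclass without affecting explicit subclass construction.

    class Scone:
        def __new__(cls, config, **kwargs):
            # Redirect only when the base class is constructed directly and the
            # config carries no BASE key; explicit SconeLegacy(...) calls fail the
            # `cls is Scone` test and are left alone, so the redirect never
            # re-dispatches an already-selected subclass.
            if cls is Scone and config.get("BASE") is None:
                cls = SconeLegacy
            return super().__new__(cls)

        def __init__(self, config, **kwargs):
            self.config = config


    class SconeLegacy(Scone):
        pass


    if __name__ == "__main__":
        print(type(Scone({"BASE": "scone_input.yml"})).__name__)  # -> Scone
        print(type(Scone({})).__name__)                           # -> SconeLegacy

Because the legacy subclass inherits its constructor (and, in the patch, most behaviour) from the base class, the swap is invisible to callers; this is also why classifier.py now asserts `isinstance(t, cls)` rather than `t.__class__ == cls`, so that a SconeLegacyClassifier instance still satisfies a SconeClassifier model dependency.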