From 4ae450bc131d13d4825bf905c801fad28209cac2 Mon Sep 17 00:00:00 2001 From: GopiGugan Date: Fri, 12 May 2023 08:13:32 -0400 Subject: [PATCH] Handle edge cases (freyja) --- .gitignore | 1 + freyja/generate_summary.py | 23 +++++---- freyja/process.py | 98 ++++++++++++++++++++++++++------------ freyja/trim.py | 3 +- 4 files changed, 85 insertions(+), 40 deletions(-) diff --git a/.gitignore b/.gitignore index a8196aa..9be57e7 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ results/* data/tile-coords.csv *bin *.db +*.log # Fasta *\.fa diff --git a/freyja/generate_summary.py b/freyja/generate_summary.py index fc1af9c..1c070da 100644 --- a/freyja/generate_summary.py +++ b/freyja/generate_summary.py @@ -54,15 +54,20 @@ def parse_lin(self, tsvfile, threshold=0.01): results = [] other_freq = 0. for lname, est in zip(lineages, estimates): - freq = float(est) - if freq < threshold: - other_freq += freq - continue - # look up closest LOI - fullname = self.expand_lineage(lname) - match = self.match_lineage(fullname) - results.append({'sample': sample, 'name': lname, 'LOI': match, - 'frequency': float(est)}) + try: + freq = float(est) + if freq < threshold: + other_freq += freq + continue + # look up closest LOI + fullname = self.expand_lineage(lname) + match = self.match_lineage(fullname) + results.append({'sample': sample, 'name': lname, 'LOI': match, + 'frequency': float(est)}) + except ValueError: + # Empty file + results.append({'sample': sample, 'name': 'NA', 'LOI': 'NA', + 'frequency': -1000}) # report sum of lineages below threshold results.append({'sample': sample, 'name': 'minor', 'frequency': other_freq, 'LOI': 'minor'}) diff --git a/freyja/process.py b/freyja/process.py index 42d518d..9e735c2 100644 --- a/freyja/process.py +++ b/freyja/process.py @@ -11,6 +11,7 @@ import requests import pandas as pd import json +from math import isnan from trim import send_error_notification, rename_file from generate_summary import LinParser @@ -172,6 +173,9 @@ def summarize_run_data(usher_barcodes, update_barcodes, freyja_update, path, ind :param callback: function, option to print messages to the console """ + if callback: + callback(path) + if update_barcodes or not os.path.isfile(usher_barcodes): callback("Downloading usher barcodes...") url = 'https://raw.githubusercontent.com/andersen-lab/Freyja/main/freyja/data/usher_barcodes.csv' @@ -205,6 +209,9 @@ def summarize_run_data(usher_barcodes, update_barcodes, freyja_update, path, ind samples = [os.path.split(x[0])[1] for x in os.walk(results_dir) if x[0] is not results_dir] for sample in samples: + if callback: + callback(sample) + if sample == "Undetermined": callback("Ignoring Undetermined...") continue @@ -229,32 +236,33 @@ def summarize_run_data(usher_barcodes, update_barcodes, freyja_update, path, ind if len(varfiles) == 0: send_error_notification(message="Freyja did not process results for {} sample {}".format(results_dir, sample)) - var = pd.read_csv(varfiles[0], sep='\t') - var['MUT'], var['SAM'] = var.apply(lambda row: row['REF'] + str(row['POS']) + row['ALT'], axis=1), sample - var = var[['SAM', 'ALT_AA', 'MUT', 'ALT_DP', 'TOTAL_DP']] - var.columns = ['sample', 'mutation', 'nucleotide', 'count', 'coverage'] - var = var.fillna('') - - for variant in variants: - tab = var[var['nucleotide'].isin(muts[variant])] - tab = tab.reset_index(drop = True) - - # create more dictionaries - results = tab.to_dict(orient='index') - - if csv_summary[variant]['loi'] not in summarized_output['lineagesOfInterest']: - summarized_output['lineagesOfInterest'].update({csv_summary[variant]['loi']: {}}) - - if sample not in summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']]: - summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']].update({sample: {}}) + if len(variants) > 0: + var = pd.read_csv(varfiles[0], sep='\t') + var['MUT'], var['SAM'] = var.apply(lambda row: row['REF'] + str(row['POS']) + row['ALT'], axis=1), sample + var = var[['SAM', 'ALT_AA', 'MUT', 'ALT_DP', 'TOTAL_DP']] + var.columns = ['sample', 'mutation', 'nucleotide', 'count', 'coverage'] + var = var.fillna('') + + for variant in variants: + tab = var[var['nucleotide'].isin(muts[variant])] + tab = tab.reset_index(drop = True) - if variant not in summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']][sample]: - summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']][sample].update({variant: {'mutInfo': [], 'estimate': {}}}) + # create more dictionaries + results = tab.to_dict(orient='index') - summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']][sample][variant]['estimate'].update({'est': csv_summary[variant]['frequency'], 'lower.95': '', 'upper.95': '', '__row': sample}) - - for _, record in results.items(): - summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']][sample][variant]['mutInfo'].append(record) + if csv_summary[variant]['loi'] not in summarized_output['lineagesOfInterest']: + summarized_output['lineagesOfInterest'].update({csv_summary[variant]['loi']: {}}) + + if sample not in summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']]: + summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']].update({sample: {}}) + + if variant not in summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']][sample]: + summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']][sample].update({variant: {'mutInfo': [], 'estimate': {}}}) + + summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']][sample][variant]['estimate'].update({'est': csv_summary[variant]['frequency'], 'lower.95': '', 'upper.95': '', '__row': sample}) + + for _, record in results.items(): + summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']][sample][variant]['mutInfo'].append(record) # Handle 'minor' lineages and samples that had an error due to low coverage for lin in ['minor', 'NA']: @@ -273,10 +281,29 @@ def summarize_run_data(usher_barcodes, update_barcodes, freyja_update, path, ind try: site = meta_file.filter(regex="location\sname*") date = meta_file.filter(regex="collection\sdate") - metadata = {'sample': sample, 'lab': lab, 'coldate': date.loc[sample, date.columns.tolist()[0]], 'site': site.loc[sample, site.columns.tolist()[0]]} + # Check if date is NaN + try: + if isnan(date.loc[sample, date.columns.tolist()[0]]): + dt = '' + except TypeError: + if type(date.loc[sample, date.columns.tolist()[0]]) is not str: + dt = date.loc[sample, date.columns.tolist()[0]][0] + else: + dt = date.loc[sample, date.columns.tolist()[0]] + + try: + if isnan(site.loc[sample, site.columns.tolist()[0]]): + st = '' + except TypeError: + if type(site.loc[sample, site.columns.tolist()[0]]) is not str: + st = site.loc[sample, site.columns.tolist()[0]][0] + else: + st = site.loc[sample, site.columns.tolist()[0]] + + metadata = {'sample': sample, 'lab': lab, 'coldate': dt, 'site': st } metadata_list.append(metadata) except KeyError: - sys.stderr.write(f"Error: Metadata for sample {sample} from {path} couldn't be found in the metadata file") + sys.stderr.write(f"Error: Metadata for sample {sample} from {path} couldn't be found in the metadata file\n") if meta_file is not None: summarized_output.update({'metadata': metadata_list}) @@ -297,7 +324,7 @@ def parse_args(): parser = argparse.ArgumentParser( description="Runs minimap2.py on new data" ) - parser.add_argument('--db', type=str, default="data/gromstole.db", + parser.add_argument('--db', type=str, default="data/freyja.db", help="Path to the database") parser.add_argument('--indir', type=str, default="/home/wastewater/uploads", help="Path to the uploads directory") @@ -362,6 +389,10 @@ def parse_args(): cursor, connection = open_connection(args.db, callback=cb.callback) new_files, runs = get_files(cursor, files, args.ignore_list, args.check, callback=cb.callback) + if len(new_files) == 0: + cb.callback("No new data files") + exit(0) + # Write the paths to a temporary file paths = tempfile.NamedTemporaryFile('w', delete=False) for file_path in new_files: @@ -405,7 +436,10 @@ def parse_args(): basepath, sample = os.path.split(processed_path) lab, run = basepath.split(os.path.sep)[-2:] sname = sample.split('_')[0] - unique_samples.add(os.path.join(args.outdir, lab, run, sname)) + if os.path.basename(args.outdir) == lab: + unique_samples.add(os.path.join(args.outdir, run, sname)) + else: + unique_samples.add(os.path.join(args.outdir, lab, run, sname)) runs.add(basepath) # Generate the summary freyja csv files for each sample @@ -442,8 +476,12 @@ def parse_args(): summarize_run_data(args.barcodes, args.update_barcodes, args.freyja_update, run, args.indir, args.outdir, callback=cb.callback) - os.unlink(paths.name) - os.unlink(processed_files.name) + try: + os.remove(paths.name) + os.remove(processed_files.name) + except FileNotFoundError: + pass + connection.commit() connection.close() diff --git a/freyja/trim.py b/freyja/trim.py index 09d1c97..8499980 100644 --- a/freyja/trim.py +++ b/freyja/trim.py @@ -28,7 +28,7 @@ def send_error_notification(message): exit(-1) msg = MIMEMultipart("related") - msg['Subject'] = "ATTENTION: Error running gromstole pipeline" + msg['Subject'] = "ATTENTION: Error running freyja pipeline" msg['From'] = "Gromstole Notification <{}>".format(config["EMAIL_ADDRESS"]) msg['To'] = config["SEND_TO"] @@ -265,6 +265,7 @@ def rename_file(filepath): # Check to see if the results directory has freyja run data if len(fnmatch.filter(os.listdir(result_dir), '*.*')) > 0: + cb.callback("Files already exist in the folder") continue tf1, tf2 = cutadapt(fq1=r1, fq2=r2, ncores=2, path=args.cutadapt, sendemail=args.sendemail)