Handle edge cases (freyja)
GopiGugan committed May 12, 2023
1 parent 6a08077 commit 4ae450b
Showing 4 changed files with 85 additions and 40 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -14,6 +14,7 @@ results/*
data/tile-coords.csv
*bin
*.db
*.log

# Fasta
*\.fa
23 changes: 14 additions & 9 deletions freyja/generate_summary.py
@@ -54,15 +54,20 @@ def parse_lin(self, tsvfile, threshold=0.01):
results = []
other_freq = 0.
for lname, est in zip(lineages, estimates):
freq = float(est)
if freq < threshold:
other_freq += freq
continue
# look up closest LOI
fullname = self.expand_lineage(lname)
match = self.match_lineage(fullname)
results.append({'sample': sample, 'name': lname, 'LOI': match,
'frequency': float(est)})
try:
freq = float(est)
if freq < threshold:
other_freq += freq
continue
# look up closest LOI
fullname = self.expand_lineage(lname)
match = self.match_lineage(fullname)
results.append({'sample': sample, 'name': lname, 'LOI': match,
'frequency': float(est)})
except ValueError:
# Empty file
results.append({'sample': sample, 'name': 'NA', 'LOI': 'NA',
'frequency': -1000})

# report sum of lineages below threshold
results.append({'sample': sample, 'name': 'minor', 'frequency': other_freq, 'LOI': 'minor'})
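
The new try/except handles Freyja outputs whose lineages/abundances fields are empty: float('') raises ValueError, and the sample is recorded once with 'NA' placeholders instead of crashing the summary. A minimal standalone sketch of the pattern (the sample values are made up, and the sketch narrows the except to the float() call, whereas the commit wraps the whole loop body):

    # Sketch: an empty Freyja field makes float('') raise ValueError,
    # so the sample gets a single 'NA' placeholder record.
    def parse_estimates(sample, lineages, estimates, threshold=0.01):
        results, other_freq = [], 0.
        for lname, est in zip(lineages, estimates):
            try:
                freq = float(est)
            except ValueError:
                results.append({'sample': sample, 'name': 'NA',
                                'LOI': 'NA', 'frequency': -1000})
                continue
            if freq < threshold:
                other_freq += freq
                continue
            results.append({'sample': sample, 'name': lname, 'frequency': freq})
        return results, other_freq

    print(parse_estimates('S1', [''], ['']))          # empty file -> NA record
    print(parse_estimates('S1', ['BA.2'], ['0.92']))  # normal record
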
98 changes: 68 additions & 30 deletions freyja/process.py
@@ -11,6 +11,7 @@
import requests
import pandas as pd
import json
from math import isnan

from trim import send_error_notification, rename_file
from generate_summary import LinParser
@@ -172,6 +173,9 @@ def summarize_run_data(usher_barcodes, update_barcodes, freyja_update, path, ind
:param callback: function, option to print messages to the console
"""

if callback:
callback(path)

if update_barcodes or not os.path.isfile(usher_barcodes):
callback("Downloading usher barcodes...")
url = 'https://raw.githubusercontent.com/andersen-lab/Freyja/main/freyja/data/usher_barcodes.csv'
@@ -205,6 +209,9 @@ def summarize_run_data(usher_barcodes, update_barcodes, freyja_update, path, ind
samples = [os.path.split(x[0])[1] for x in os.walk(results_dir) if x[0] is not results_dir]

for sample in samples:
if callback:
callback(sample)

if sample == "Undetermined":
callback("Ignoring Undetermined...")
continue
@@ -229,32 +236,33 @@
if len(varfiles) == 0:
send_error_notification(message="Freyja did not process results for {} sample {}".format(results_dir, sample))

var = pd.read_csv(varfiles[0], sep='\t')
var['MUT'], var['SAM'] = var.apply(lambda row: row['REF'] + str(row['POS']) + row['ALT'], axis=1), sample
var = var[['SAM', 'ALT_AA', 'MUT', 'ALT_DP', 'TOTAL_DP']]
var.columns = ['sample', 'mutation', 'nucleotide', 'count', 'coverage']
var = var.fillna('')

for variant in variants:
tab = var[var['nucleotide'].isin(muts[variant])]
tab = tab.reset_index(drop = True)

# create more dictionaries
results = tab.to_dict(orient='index')

if csv_summary[variant]['loi'] not in summarized_output['lineagesOfInterest']:
summarized_output['lineagesOfInterest'].update({csv_summary[variant]['loi']: {}})

if sample not in summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']]:
summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']].update({sample: {}})
if len(variants) > 0:
var = pd.read_csv(varfiles[0], sep='\t')
var['MUT'], var['SAM'] = var.apply(lambda row: row['REF'] + str(row['POS']) + row['ALT'], axis=1), sample
var = var[['SAM', 'ALT_AA', 'MUT', 'ALT_DP', 'TOTAL_DP']]
var.columns = ['sample', 'mutation', 'nucleotide', 'count', 'coverage']
var = var.fillna('')

for variant in variants:
tab = var[var['nucleotide'].isin(muts[variant])]
tab = tab.reset_index(drop = True)

if variant not in summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']][sample]:
summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']][sample].update({variant: {'mutInfo': [], 'estimate': {}}})
# create more dictionaries
results = tab.to_dict(orient='index')

summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']][sample][variant]['estimate'].update({'est': csv_summary[variant]['frequency'], 'lower.95': '', 'upper.95': '', '__row': sample})

for _, record in results.items():
summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']][sample][variant]['mutInfo'].append(record)
if csv_summary[variant]['loi'] not in summarized_output['lineagesOfInterest']:
summarized_output['lineagesOfInterest'].update({csv_summary[variant]['loi']: {}})

if sample not in summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']]:
summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']].update({sample: {}})

if variant not in summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']][sample]:
summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']][sample].update({variant: {'mutInfo': [], 'estimate': {}}})

summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']][sample][variant]['estimate'].update({'est': csv_summary[variant]['frequency'], 'lower.95': '', 'upper.95': '', '__row': sample})

for _, record in results.items():
summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']][sample][variant]['mutInfo'].append(record)
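
Wrapping the table step in if len(variants) > 0 skips the pandas work entirely when no lineage of interest matched for a sample. For context, a self-contained sketch of the guarded MUT-column construction, using a made-up two-row table in the layout of the variants TSV:

    import pandas as pd

    # Hypothetical iVar-style rows; REF + POS + ALT becomes the
    # nucleotide key used to match lineage-defining mutations.
    var = pd.DataFrame({'REF': ['A', 'C'], 'POS': [23403, 241], 'ALT': ['G', 'T'],
                        'ALT_AA': ['D614G', None], 'ALT_DP': [950, 12],
                        'TOTAL_DP': [1000, 800]})
    var['MUT'] = var.apply(lambda row: row['REF'] + str(row['POS']) + row['ALT'], axis=1)
    var = var[['ALT_AA', 'MUT', 'ALT_DP', 'TOTAL_DP']]
    var.columns = ['mutation', 'nucleotide', 'count', 'coverage']
    print(var.fillna(''))  # D614G/A23403G row plus a blank-AA C241T row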

# Handle 'minor' lineages and samples that had an error due to low coverage
for lin in ['minor', 'NA']:
@@ -273,10 +281,29 @@
try:
site = meta_file.filter(regex="location\sname*")
date = meta_file.filter(regex="collection\sdate")
metadata = {'sample': sample, 'lab': lab, 'coldate': date.loc[sample, date.columns.tolist()[0]], 'site': site.loc[sample, site.columns.tolist()[0]]}
# Check if date is NaN
try:
if isnan(date.loc[sample, date.columns.tolist()[0]]):
dt = ''
except TypeError:
if type(date.loc[sample, date.columns.tolist()[0]]) is not str:
dt = date.loc[sample, date.columns.tolist()[0]][0]
else:
dt = date.loc[sample, date.columns.tolist()[0]]

try:
if isnan(site.loc[sample, site.columns.tolist()[0]]):
st = ''
except TypeError:
if type(site.loc[sample, site.columns.tolist()[0]]) is not str:
st = site.loc[sample, site.columns.tolist()[0]][0]
else:
st = site.loc[sample, site.columns.tolist()[0]]

metadata = {'sample': sample, 'lab': lab, 'coldate': dt, 'site': st }
metadata_list.append(metadata)
except KeyError:
sys.stderr.write(f"Error: Metadata for sample {sample} from {path} couldn't be found in the metadata file")
sys.stderr.write(f"Error: Metadata for sample {sample} from {path} couldn't be found in the metadata file\n")
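
The paired try/except blocks normalize a metadata cell that can be a float NaN (a blank spreadsheet field), a plain string, or an indexable of strings when the sample index is duplicated: isnan() answers for floats and raises TypeError for everything else. The same branching written as one helper — the name clean_cell is ours, and a non-NaN float passes through unchanged here:

    from math import isnan

    def clean_cell(value):
        # NaN (blank cell) -> ''; isnan raises TypeError for non-floats,
        # i.e. a plain string or a duplicated-index selection
        try:
            return '' if isnan(value) else value
        except TypeError:
            return value if isinstance(value, str) else value[0]

    print(repr(clean_cell(float('nan'))))  # ''
    print(clean_cell('2023-05-12'))        # plain string, returned as-is
    print(clean_cell(['2023-05-12']))      # duplicated index -> first entry
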

if meta_file is not None:
summarized_output.update({'metadata': metadata_list})
Expand All @@ -297,7 +324,7 @@ def parse_args():
parser = argparse.ArgumentParser(
description="Runs minimap2.py on new data"
)
parser.add_argument('--db', type=str, default="data/gromstole.db",
parser.add_argument('--db', type=str, default="data/freyja.db",
help="Path to the database")
parser.add_argument('--indir', type=str, default="/home/wastewater/uploads",
help="Path to the uploads directory")
@@ -362,6 +389,10 @@ def parse_args():
cursor, connection = open_connection(args.db, callback=cb.callback)
new_files, runs = get_files(cursor, files, args.ignore_list, args.check, callback=cb.callback)

if len(new_files) == 0:
cb.callback("No new data files")
exit(0)

# Write the paths to a temporary file
paths = tempfile.NamedTemporaryFile('w', delete=False)
for file_path in new_files:
@@ -405,7 +436,10 @@ def parse_args():
basepath, sample = os.path.split(processed_path)
lab, run = basepath.split(os.path.sep)[-2:]
sname = sample.split('_')[0]
unique_samples.add(os.path.join(args.outdir, lab, run, sname))
if os.path.basename(args.outdir) == lab:
unique_samples.add(os.path.join(args.outdir, run, sname))
else:
unique_samples.add(os.path.join(args.outdir, lab, run, sname))
runs.add(basepath)
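
The basename check keeps the lab directory from being doubled when args.outdir already ends in the lab name. A standalone sketch with hypothetical paths:

    import os

    def sample_out_path(outdir, lab, run, sname):
        # if outdir already ends with the lab name, don't repeat it
        if os.path.basename(outdir) == lab:
            return os.path.join(outdir, run, sname)
        return os.path.join(outdir, lab, run, sname)

    print(sample_out_path('/results/lab1', 'lab1', 'run1', 'S1'))
    # /results/lab1/run1/S1, not /results/lab1/lab1/run1/S1
    print(sample_out_path('/results', 'lab1', 'run1', 'S1'))
    # /results/lab1/run1/S1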

# Generate the summary freyja csv files for each sample
@@ -442,8 +476,12 @@ def parse_args():
summarize_run_data(args.barcodes, args.update_barcodes, args.freyja_update, run, args.indir, args.outdir, callback=cb.callback)


os.unlink(paths.name)
os.unlink(processed_files.name)
try:
os.remove(paths.name)
os.remove(processed_files.name)
except FileNotFoundError:
pass
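
os.remove and os.unlink are aliases in Python, so the substantive change is tolerating a temporary file that was already cleaned up. A minimal illustration:

    import os
    import tempfile

    tmp = tempfile.NamedTemporaryFile('w', delete=False)
    tmp.close()
    for _ in range(2):  # second pass: the file is already gone
        try:
            os.remove(tmp.name)
        except FileNotFoundError:
            pass        # cleanup already happened; nothing to do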


connection.commit()
connection.close()
3 changes: 2 additions & 1 deletion freyja/trim.py
@@ -28,7 +28,7 @@ def send_error_notification(message):
exit(-1)

msg = MIMEMultipart("related")
msg['Subject'] = "ATTENTION: Error running gromstole pipeline"
msg['Subject'] = "ATTENTION: Error running freyja pipeline"
msg['From'] = "Gromstole Notification <{}>".format(config["EMAIL_ADDRESS"])
msg['To'] = config["SEND_TO"]

@@ -265,6 +265,7 @@ def rename_file(filepath):

# Check to see if the results directory has freyja run data
if len(fnmatch.filter(os.listdir(result_dir), '*.*')) > 0:
cb.callback("Files already exist in the folder")
continue
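
The fnmatch guard treats any dotted filename in the results folder as existing freyja output and skips re-trimming that sample. A small sketch, with a throwaway temp directory standing in for the result directory:

    import fnmatch
    import os
    import tempfile

    def has_run_data(result_dir):
        # any file with an extension counts as prior output
        return len(fnmatch.filter(os.listdir(result_dir), '*.*')) > 0

    d = tempfile.mkdtemp()
    print(has_run_data(d))                            # False: empty folder
    open(os.path.join(d, 'results.tsv'), 'w').close()
    print(has_run_data(d))                            # True: skip this sample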

tf1, tf2 = cutadapt(fq1=r1, fq2=r2, ncores=2, path=args.cutadapt, sendemail=args.sendemail)
