From 4ae450bc131d13d4825bf905c801fad28209cac2 Mon Sep 17 00:00:00 2001
From: GopiGugan <ggugan@uwo.ca>
Date: Fri, 12 May 2023 08:13:32 -0400
Subject: [PATCH] Handle edge cases (freyja)

---
 .gitignore                 |  1 +
 freyja/generate_summary.py | 23 +++++----
 freyja/process.py          | 98 ++++++++++++++++++++++++++------------
 freyja/trim.py             |  3 +-
 4 files changed, 85 insertions(+), 40 deletions(-)

diff --git a/.gitignore b/.gitignore
index a8196aa..9be57e7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,7 @@ results/*
 data/tile-coords.csv
 *bin
 *.db
+*.log
 
 # Fasta
 *\.fa
diff --git a/freyja/generate_summary.py b/freyja/generate_summary.py
index fc1af9c..1c070da 100644
--- a/freyja/generate_summary.py
+++ b/freyja/generate_summary.py
@@ -54,15 +54,20 @@ def parse_lin(self, tsvfile, threshold=0.01):
         results = []
         other_freq = 0.
         for lname, est in zip(lineages, estimates):
-            freq = float(est)
-            if freq < threshold:
-                other_freq += freq
-                continue
-            # look up closest LOI
-            fullname = self.expand_lineage(lname)
-            match = self.match_lineage(fullname)
-            results.append({'sample': sample, 'name': lname, 'LOI': match,
-                            'frequency': float(est)})
+            try:
+                freq = float(est)
+                if freq < threshold:
+                    other_freq += freq
+                    continue
+                # look up closest LOI
+                fullname = self.expand_lineage(lname)
+                match = self.match_lineage(fullname)
+                results.append({'sample': sample, 'name': lname, 'LOI': match,
+                                'frequency': float(est)})
+            except ValueError:
+                # Empty file
+                results.append({'sample': sample, 'name': 'NA', 'LOI': 'NA',
+                                'frequency': -1000})
 
         # report sum of lineages below threshold
         results.append({'sample': sample, 'name': 'minor', 'frequency': other_freq, 'LOI': 'minor'})
diff --git a/freyja/process.py b/freyja/process.py
index 42d518d..9e735c2 100644
--- a/freyja/process.py
+++ b/freyja/process.py
@@ -11,6 +11,7 @@
 import requests
 import pandas as pd
 import json
+from math import isnan
 
 from trim import send_error_notification, rename_file
 from generate_summary import LinParser
@@ -172,6 +173,9 @@ def summarize_run_data(usher_barcodes, update_barcodes, freyja_update, path, ind
     :param callback: function, option to print messages to the console
     """
 
+    if callback:
+        callback(path)
+
     if update_barcodes or not os.path.isfile(usher_barcodes):
         callback("Downloading usher barcodes...")
         url = 'https://raw.githubusercontent.com/andersen-lab/Freyja/main/freyja/data/usher_barcodes.csv'
@@ -205,6 +209,9 @@ def summarize_run_data(usher_barcodes, update_barcodes, freyja_update, path, ind
     samples = [os.path.split(x[0])[1] for x in os.walk(results_dir) if x[0] is not results_dir]
 
     for sample in samples:
+        if callback:
+            callback(sample)
+
         if sample == "Undetermined":
             callback("Ignoring Undetermined...")
             continue
@@ -229,32 +236,33 @@ def summarize_run_data(usher_barcodes, update_barcodes, freyja_update, path, ind
         if len(varfiles) == 0:
             send_error_notification(message="Freyja did not process results for {} sample {}".format(results_dir, sample))
 
-        var = pd.read_csv(varfiles[0], sep='\t')
-        var['MUT'], var['SAM'] = var.apply(lambda row: row['REF'] + str(row['POS']) + row['ALT'], axis=1), sample
-        var = var[['SAM', 'ALT_AA', 'MUT', 'ALT_DP', 'TOTAL_DP']]
-        var.columns = ['sample', 'mutation', 'nucleotide', 'count', 'coverage']
-        var = var.fillna('')
-
-        for variant in variants:
-            tab = var[var['nucleotide'].isin(muts[variant])]
-            tab = tab.reset_index(drop = True)
-			
-            # create more dictionaries
-            results = tab.to_dict(orient='index')
-            
-            if csv_summary[variant]['loi'] not in summarized_output['lineagesOfInterest']:
-                summarized_output['lineagesOfInterest'].update({csv_summary[variant]['loi']: {}})
-			
-            if sample not in summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']]:
-                summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']].update({sample: {}})
+        if len(variants) > 0:
+            var = pd.read_csv(varfiles[0], sep='\t')
+            var['MUT'], var['SAM'] = var.apply(lambda row: row['REF'] + str(row['POS']) + row['ALT'], axis=1), sample
+            var = var[['SAM', 'ALT_AA', 'MUT', 'ALT_DP', 'TOTAL_DP']]
+            var.columns = ['sample', 'mutation', 'nucleotide', 'count', 'coverage']
+            var = var.fillna('')
+
+            for variant in variants:
+                tab = var[var['nucleotide'].isin(muts[variant])]
+                tab = tab.reset_index(drop = True)
                 
-            if variant not in summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']][sample]:
-                summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']][sample].update({variant: {'mutInfo': [], 'estimate': {}}})
+                # create more dictionaries
+                results = tab.to_dict(orient='index')
                 
-            summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']][sample][variant]['estimate'].update({'est': csv_summary[variant]['frequency'], 'lower.95': '', 'upper.95': '', '__row': sample})
-            
-            for _, record in results.items():
-                summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']][sample][variant]['mutInfo'].append(record)
+                if csv_summary[variant]['loi'] not in summarized_output['lineagesOfInterest']:
+                    summarized_output['lineagesOfInterest'].update({csv_summary[variant]['loi']: {}})
+                
+                if sample not in summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']]:
+                    summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']].update({sample: {}})
+                    
+                if variant not in summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']][sample]:
+                    summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']][sample].update({variant: {'mutInfo': [], 'estimate': {}}})
+                    
+                summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']][sample][variant]['estimate'].update({'est': csv_summary[variant]['frequency'], 'lower.95': '', 'upper.95': '', '__row': sample})
+                
+                for _, record in results.items():
+                    summarized_output['lineagesOfInterest'][csv_summary[variant]['loi']][sample][variant]['mutInfo'].append(record)
 
         # Handle 'minor' lineages and samples that had an error due to low coverage
         for lin in ['minor', 'NA']:
@@ -273,10 +281,29 @@ def summarize_run_data(usher_barcodes, update_barcodes, freyja_update, path, ind
             try: 
                 site = meta_file.filter(regex="location\sname*")
                 date = meta_file.filter(regex="collection\sdate")
-                metadata = {'sample': sample, 'lab': lab, 'coldate': date.loc[sample, date.columns.tolist()[0]], 'site': site.loc[sample, site.columns.tolist()[0]]}
+                # Check if date is NaN
+                try:
+                    if isnan(date.loc[sample, date.columns.tolist()[0]]):
+                        dt = ''
+                except TypeError:
+                    if type(date.loc[sample, date.columns.tolist()[0]]) is not str:
+                        dt = date.loc[sample, date.columns.tolist()[0]][0]
+                    else:
+                        dt = date.loc[sample, date.columns.tolist()[0]]
+                
+                try:
+                    if isnan(site.loc[sample, site.columns.tolist()[0]]):
+                        st = ''
+                except TypeError:
+                    if type(site.loc[sample, site.columns.tolist()[0]]) is not str:
+                        st = site.loc[sample, site.columns.tolist()[0]][0]
+                    else:
+                        st = site.loc[sample, site.columns.tolist()[0]]
+
+                metadata = {'sample': sample, 'lab': lab, 'coldate': dt, 'site': st }
                 metadata_list.append(metadata)
             except KeyError:
-                sys.stderr.write(f"Error: Metadata for sample {sample} from {path} couldn't be found in the metadata file")        
+                sys.stderr.write(f"Error: Metadata for sample {sample} from {path} couldn't be found in the metadata file\n")        
 
     if meta_file is not None:
         summarized_output.update({'metadata': metadata_list})
@@ -297,7 +324,7 @@ def parse_args():
     parser = argparse.ArgumentParser(
         description="Runs minimap2.py on new data"
     )
-    parser.add_argument('--db', type=str, default="data/gromstole.db",
+    parser.add_argument('--db', type=str, default="data/freyja.db",
                         help="Path to the database")
     parser.add_argument('--indir', type=str, default="/home/wastewater/uploads",
                         help="Path to the uploads directory")
@@ -362,6 +389,10 @@ def parse_args():
         cursor, connection = open_connection(args.db, callback=cb.callback)
         new_files, runs = get_files(cursor, files, args.ignore_list, args.check, callback=cb.callback)
 
+        if len(new_files) == 0:
+            cb.callback("No new data files")
+            exit(0)
+
         # Write the paths to a temporary file
         paths = tempfile.NamedTemporaryFile('w', delete=False)
         for file_path in new_files:
@@ -405,7 +436,10 @@ def parse_args():
                 basepath, sample = os.path.split(processed_path)
                 lab, run = basepath.split(os.path.sep)[-2:]
                 sname = sample.split('_')[0]
-                unique_samples.add(os.path.join(args.outdir, lab, run, sname))
+                if os.path.basename(args.outdir) == lab:
+                    unique_samples.add(os.path.join(args.outdir, run, sname))
+                else:
+                    unique_samples.add(os.path.join(args.outdir, lab, run, sname))
                 runs.add(basepath)
                 
         # Generate the summary freyja csv files for each sample
@@ -442,8 +476,12 @@ def parse_args():
             summarize_run_data(args.barcodes, args.update_barcodes, args.freyja_update, run, args.indir, args.outdir, callback=cb.callback)
 
 
-        os.unlink(paths.name)
-        os.unlink(processed_files.name)
+        try:
+            os.remove(paths.name)
+            os.remove(processed_files.name)
+        except FileNotFoundError:
+            pass
+
 
         connection.commit()
         connection.close()  
diff --git a/freyja/trim.py b/freyja/trim.py
index 09d1c97..8499980 100644
--- a/freyja/trim.py
+++ b/freyja/trim.py
@@ -28,7 +28,7 @@ def send_error_notification(message):
         exit(-1)
 
     msg = MIMEMultipart("related")
-    msg['Subject'] = "ATTENTION: Error running gromstole pipeline"
+    msg['Subject'] = "ATTENTION: Error running freyja pipeline"
     msg['From'] = "Gromstole Notification <{}>".format(config["EMAIL_ADDRESS"])
     msg['To'] = config["SEND_TO"]
 
@@ -265,6 +265,7 @@ def rename_file(filepath):
 
         # Check to see if the results directory has freyja run data
         if len(fnmatch.filter(os.listdir(result_dir), '*.*')) > 0:
+            cb.callback("Files already exist in the folder")
             continue
 
         tf1, tf2 = cutadapt(fq1=r1, fq2=r2, ncores=2, path=args.cutadapt, sendemail=args.sendemail)