Merge pull request #32 from cidgoh/madeline

unknown strains, elementwise addition of counts for heterozygous mutations, hopeful bug fix
cidgoh · Oct 18, 2021 · a4433d2 · a4433d2
2 parents c0f130b + e65cde6
commit a4433d2
Show file tree

Hide file tree

Showing 3 changed files with 90 additions and 23 deletions.
diff --git a/README.md b/README.md
@@ -1 +1,15 @@
 # nf-ncov-voc
+
+**Genomic Analysis**
+
+**Functional Annotation**
+
+In this module, the genomic data for each lineage is converted to a GVF (Genomic Variant Format) file and annotated with functional information.  GVF is a variant of GFF3 that is standardized for describing genomic mutations; it is used here because it can describe mutations across multiple rows, and because the "#attributes" column can store information in custom key-value pairs.  The key-value pairs added at this stage include for each mutation: VOC/VOI status, clade-defining status (for reference lineages), and functional annotations parsed from Paul Gordon's Pokay repository.  The annotated GVFs are the input to the visualization module.
+
+**Surveillance Report**
+
+A TSV file can be produced that summarizes mutation information for SARS-CoV-2 variants.  This file contains much of the same information found in the GVF files in a more human-readable format, with all reference lineages per variant merged into one concise TSV.
+
+**Visualization**
+
+
diff --git a/bin/gvf2tsv.py b/bin/gvf2tsv.py
@@ -59,18 +59,6 @@ def gvf2tsv(gvf):
     #rename 'dp' column to 'sequence_depth', make 'viral_lineage' plural
     df = df.rename(columns={'dp': 'sequence_depth', 'viral_lineage': 'viral_lineages'})
 
-    #reorder columns
-    cols = ['name', 'nt_name', 'aa_name', 'multi_aa_name', 
-       'multiaa_comb_mutation', 'start', 'vcf_gene', 'chrom_region',
-       'mutation_type', 'sequence_depth', 'ps_filter', 'ps_exc', 'mat_pep_id',
-       'mat_pep_desc', 'mat_pep_acc', 'ro', 'ao', 'reference_seq',
-       'variant_seq', 'viral_lineages', 'function_category', 'citation',
-       'comb_mutation', 'function_description', 'heterozygosity',
-       'clade_defining', 'who_label', 'variant', 'variant_status',
-       'voi_designation_date', 'voc_designation_date',
-       'alert_designation_date']
-    df = df[cols]
-
     return df
 
 
@@ -81,12 +69,73 @@ def gvf2tsv(gvf):
     filepath = args.outtsv
     gvf_files = args.gvf #this is a list of strings
 
+    #convert all gvf files to tsv and concatenate them
+    print("Processing:")
+    print(gvf_files[0])
     tsv_df = gvf2tsv(gvf_files[0])
-
     for gvf in gvf_files[1:]:
+        print(gvf)
         new_tsv_df = gvf2tsv(gvf)
-        tsv_df = pd.concat([tsv_df, new_tsv_df], ignore_index=True)
+        tsv_df = pd.concat([tsv_df, new_tsv_df], ignore_index=True)   
+
+
+    #find identical rows across strains, and keep only one row.
+
+    #change n/a to 0 in 'ao' for counting purposes
+    tsv_df['ao'] = tsv_df['ao'].str.replace("n/a", "0")
+
+    for colname in ['sequence_depth', 'ao', 'ro']:
+        #split up at commas into new columns: make a new mini-df
+        split_series = tsv_df[colname].str.split(pat=',').apply(pd.Series)
+        #rename series columns to 'ao_0', 'a0_1', etc.
+        split_series.columns = [colname + '_' + str(name) for name in split_series.columns.values]
+        #ensure all counts are numeric
+        for column in split_series.columns:
+            split_series[column] = pd.to_numeric(split_series[column], errors='coerce')
+        #append series to tsv_df
+        tsv_df = pd.concat([tsv_df, split_series], axis=1)
+
+    cols_to_check = ['name', 'nt_name', 'aa_name', 'multi_aa_name', 'multiaa_comb_mutation', 'start', 'function_category', 'citation', 'comb_mutation', 'function_description', 'heterozygosity']
+
+    agg_dict = dict((col,'first') for col in tsv_df.columns.values.tolist())
+    agg_dict['viral_lineages'] = ', '.join
+    agg_dict['clade_defining'] = ', '.join
+
+    #sum split columns
+    for string in ['sequence_depth_', 'ao_', 'ro_']:
+        relevant_keys = [key for key, value in agg_dict.items() if string in key.lower()]
+        for key in relevant_keys:
+            agg_dict[key] = 'sum'
+
+    final_df = tsv_df.groupby(cols_to_check).agg(agg_dict)
+
+    #rejoin split columns (ao, sequence_depth, ro) with comma separation
+    for string in ['sequence_depth_', 'ao_', 'ro_']:
+        colnames = [i for i in tsv_df.columns.values.tolist() if string in i]
+        final_df[string + 'combined'] = final_df[colnames].apply(lambda row: ','.join(row.values.astype(str)), axis=1)
+        #drop split columns
+        final_df = final_df.drop(labels=colnames, axis=1)
+
+    #replace ao, ro, sequence_depth with the added up columns
+    final_df = final_df.drop(labels=['ao', 'ro', 'sequence_depth'], axis=1)
+    final_df = final_df.rename(columns={'sequence_depth_combined': 'sequence_depth', 'ro_combined': 'ro', 'ao_combined': 'ao'})
 
-    tsv_df.to_csv(filepath, sep='\t', index=False)
+    #reorder columns
+    cols = ['name', 'nt_name', 'aa_name', 'multi_aa_name', 
+       'multiaa_comb_mutation', 'start', 'vcf_gene', 'chrom_region',
+       'mutation_type', 'sequence_depth', 'ps_filter', 'ps_exc', 'mat_pep_id',
+       'mat_pep_desc', 'mat_pep_acc', 'ro', 'ao', 'reference_seq',
+       'variant_seq', 'viral_lineages', 'function_category', 'citation',
+       'comb_mutation', 'function_description', 'heterozygosity',
+       'clade_defining', 'who_label', 'variant', 'variant_status',
+       'voi_designation_date', 'voc_designation_date',
+       'alert_designation_date']
+    final_df = final_df[cols]
 
-    print("Saved as: " + filepath)
+    #save the final df to a .tsv
+    final_df.to_csv(filepath, sep='\t', index=False)
+    print("")
+    print("Processing complete.")
+    print("Saved as: " + filepath)
+
+
diff --git a/bin/vcf2gvf.py b/bin/vcf2gvf.py
@@ -35,7 +35,7 @@ def parse_args():
                         default=None,
                         help='.tsv of multi-aa mutation names to split up into individual aa names')
     parser.add_argument('--strain', type=str,
-                        default=None,
+                        default='n/a',
                         help='lineage')
     parser.add_argument('--outvcf', type=str,
                         help='Filename for the output GVF file')
@@ -128,7 +128,9 @@ def vcftogvf(var_data, strain, GENE_POSITIONS_DICT, names_to_split):
     info = df['INFO'].str.split(pat=';').apply(pd.Series) #split at ;, form dataframe
     for column in info.columns:
         split = info[column].str.split(pat='=').apply(pd.Series)
-        title = split[0].drop_duplicates().tolist()[0].lower()
+        title = split[0].drop_duplicates().tolist()[0]
+        if isinstance(title, str):
+            title = title.lower()
         content = split[1]
         info[column] = content #ignore "tag=" in column content
         info.rename(columns={column:title}, inplace=True) #make attribute tag as column label
@@ -285,7 +287,9 @@ def add_functions(gvf, annotation_file, clade_file, strain):
         merged_df["#attributes"] = merged_df["#attributes"].astype(str) + "voi_designation_date=" + voi_designation_date + ';'
         merged_df["#attributes"] = merged_df["#attributes"].astype(str) + "voc_designation_date=" + voc_designation_date + ';'
         merged_df["#attributes"] = merged_df["#attributes"].astype(str) + "alert_designation_date=" + alert_designation_date + ';'
-
+    else:
+        merged_df["#attributes"] = merged_df["#attributes"].astype(str)  + "clade_defining=n/a;" + "who_label=n/a;" + "variant=n/a;" + "variant_status=n/a;" + "voi_designation_date=n/a;" + "voc_designation_date=n/a;" + "alert_designation_date=n/a;"
+
 
     #add ID to attributes
     merged_df["#attributes"] = 'ID=' + merged_df['id'].astype(str) + ';' + merged_df["#attributes"].astype(str)
@@ -326,6 +330,7 @@ def add_functions(gvf, annotation_file, clade_file, strain):
 
     file = args.vcffile
 
+
     print("Processing: " + file)
 
     #create gvf from annotated vcf (ignoring pragmas for now)
@@ -352,22 +357,21 @@ def add_functions(gvf, annotation_file, clade_file, strain):
         all_strains_mutations.append(mutations)
         leftover_df = leftover_df.append(leftover_names)
         unmatched_clade_names = unmatched_clade_names.append(leftover_clade_names)
-
         #save unmatched names (in tsv but not in functional_annotations) across all strains to a .tsv file
-        leftover_names_filepath = args.strain + "_leftover_names.tsv"
+        leftover_names_filepath = "leftover_names.tsv"
         leftover_df.to_csv(leftover_names_filepath, sep='\t', index=False)
         print("")
         print("Mutation names not found in functional annotations file saved to " + leftover_names_filepath)
 
         #save unmatched clade-defining mutation names to a .tsv file
-        leftover_clade_names_filepath = args.strain + "_leftover_clade_defining_names.tsv"
+        leftover_clade_names_filepath = "leftover_clade_defining_names.tsv"
         unmatched_clade_names.to_csv(leftover_clade_names_filepath, sep='\t', index=False)
         print("Clade-defining mutation names not found in the annotated VCF saved to " + leftover_clade_names_filepath)
 
         #print number of unique mutations across all strains    
         flattened = [val for sublist in all_strains_mutations for val in sublist]
         arr = np.array(flattened)
-        print("# unique mutations in " + args.strain + " VCF file: ", np.unique(arr).shape[0])
+        print("# unique mutations in VCF file: ", np.unique(arr).shape[0])
 
     print("")        
     print("Processing complete.")