Skip to content

Commit

Permalink
Merge pull request #32 from cidgoh/madeline
Browse files Browse the repository at this point in the history
unknown strains, elementwise addition of counts for heterozygous mutations, hopeful bug fix
  • Loading branch information
anwarMZ authored Oct 18, 2021
2 parents c0f130b + e65cde6 commit a4433d2
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 23 deletions.
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1 +1,15 @@
# nf-ncov-voc

**Genomic Analysis**

**Functional Annotation**

In this module, the genomic data for each lineage is converted to a GVF (Genomic Variant Format) file and annotated with functional information. GVF is a variant of GFF3 that is standardized for describing genomic mutations; it is used here because it can describe mutations across multiple rows, and because the "#attributes" column can store information in custom key-value pairs. The key-value pairs added at this stage include for each mutation: VOC/VOI status, clade-defining status (for reference lineages), and functional annotations parsed from Paul Gordon's Pokay repository. The annotated GVFs are the input to the visualization module.

**Surveillance Report**

A TSV file can be produced that summarizes mutation information for SARS-CoV-2 variants. This file contains much of the same information found in the GVF files in a more human-readable format, with all reference lineages per variant merged into one concise TSV.

**Visualization**


81 changes: 65 additions & 16 deletions bin/gvf2tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,18 +59,6 @@ def gvf2tsv(gvf):
#rename 'dp' column to 'sequence_depth', make 'viral_lineage' plural
df = df.rename(columns={'dp': 'sequence_depth', 'viral_lineage': 'viral_lineages'})

#reorder columns
cols = ['name', 'nt_name', 'aa_name', 'multi_aa_name',
'multiaa_comb_mutation', 'start', 'vcf_gene', 'chrom_region',
'mutation_type', 'sequence_depth', 'ps_filter', 'ps_exc', 'mat_pep_id',
'mat_pep_desc', 'mat_pep_acc', 'ro', 'ao', 'reference_seq',
'variant_seq', 'viral_lineages', 'function_category', 'citation',
'comb_mutation', 'function_description', 'heterozygosity',
'clade_defining', 'who_label', 'variant', 'variant_status',
'voi_designation_date', 'voc_designation_date',
'alert_designation_date']
df = df[cols]

return df


Expand All @@ -81,12 +69,73 @@ def gvf2tsv(gvf):
filepath = args.outtsv
gvf_files = args.gvf #this is a list of strings

#convert all gvf files to tsv and concatenate them
print("Processing:")
print(gvf_files[0])
tsv_df = gvf2tsv(gvf_files[0])

for gvf in gvf_files[1:]:
print(gvf)
new_tsv_df = gvf2tsv(gvf)
tsv_df = pd.concat([tsv_df, new_tsv_df], ignore_index=True)
tsv_df = pd.concat([tsv_df, new_tsv_df], ignore_index=True)


#find identical rows across strains, and keep only one row.

#change n/a to 0 in 'ao' for counting purposes
tsv_df['ao'] = tsv_df['ao'].str.replace("n/a", "0")

for colname in ['sequence_depth', 'ao', 'ro']:
#split up at commas into new columns: make a new mini-df
split_series = tsv_df[colname].str.split(pat=',').apply(pd.Series)
#rename series columns to 'ao_0', 'a0_1', etc.
split_series.columns = [colname + '_' + str(name) for name in split_series.columns.values]
#ensure all counts are numeric
for column in split_series.columns:
split_series[column] = pd.to_numeric(split_series[column], errors='coerce')
#append series to tsv_df
tsv_df = pd.concat([tsv_df, split_series], axis=1)

cols_to_check = ['name', 'nt_name', 'aa_name', 'multi_aa_name', 'multiaa_comb_mutation', 'start', 'function_category', 'citation', 'comb_mutation', 'function_description', 'heterozygosity']

agg_dict = dict((col,'first') for col in tsv_df.columns.values.tolist())
agg_dict['viral_lineages'] = ', '.join
agg_dict['clade_defining'] = ', '.join

#sum split columns
for string in ['sequence_depth_', 'ao_', 'ro_']:
relevant_keys = [key for key, value in agg_dict.items() if string in key.lower()]
for key in relevant_keys:
agg_dict[key] = 'sum'

final_df = tsv_df.groupby(cols_to_check).agg(agg_dict)

#rejoin split columns (ao, sequence_depth, ro) with comma separation
for string in ['sequence_depth_', 'ao_', 'ro_']:
colnames = [i for i in tsv_df.columns.values.tolist() if string in i]
final_df[string + 'combined'] = final_df[colnames].apply(lambda row: ','.join(row.values.astype(str)), axis=1)
#drop split columns
final_df = final_df.drop(labels=colnames, axis=1)

#replace ao, ro, sequence_depth with the added up columns
final_df = final_df.drop(labels=['ao', 'ro', 'sequence_depth'], axis=1)
final_df = final_df.rename(columns={'sequence_depth_combined': 'sequence_depth', 'ro_combined': 'ro', 'ao_combined': 'ao'})

tsv_df.to_csv(filepath, sep='\t', index=False)
#reorder columns
cols = ['name', 'nt_name', 'aa_name', 'multi_aa_name',
'multiaa_comb_mutation', 'start', 'vcf_gene', 'chrom_region',
'mutation_type', 'sequence_depth', 'ps_filter', 'ps_exc', 'mat_pep_id',
'mat_pep_desc', 'mat_pep_acc', 'ro', 'ao', 'reference_seq',
'variant_seq', 'viral_lineages', 'function_category', 'citation',
'comb_mutation', 'function_description', 'heterozygosity',
'clade_defining', 'who_label', 'variant', 'variant_status',
'voi_designation_date', 'voc_designation_date',
'alert_designation_date']
final_df = final_df[cols]

print("Saved as: " + filepath)
#save the final df to a .tsv
final_df.to_csv(filepath, sep='\t', index=False)
print("")
print("Processing complete.")
print("Saved as: " + filepath)


18 changes: 11 additions & 7 deletions bin/vcf2gvf.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def parse_args():
default=None,
help='.tsv of multi-aa mutation names to split up into individual aa names')
parser.add_argument('--strain', type=str,
default=None,
default='n/a',
help='lineage')
parser.add_argument('--outvcf', type=str,
help='Filename for the output GVF file')
Expand Down Expand Up @@ -128,7 +128,9 @@ def vcftogvf(var_data, strain, GENE_POSITIONS_DICT, names_to_split):
info = df['INFO'].str.split(pat=';').apply(pd.Series) #split at ;, form dataframe
for column in info.columns:
split = info[column].str.split(pat='=').apply(pd.Series)
title = split[0].drop_duplicates().tolist()[0].lower()
title = split[0].drop_duplicates().tolist()[0]
if isinstance(title, str):
title = title.lower()
content = split[1]
info[column] = content #ignore "tag=" in column content
info.rename(columns={column:title}, inplace=True) #make attribute tag as column label
Expand Down Expand Up @@ -285,7 +287,9 @@ def add_functions(gvf, annotation_file, clade_file, strain):
merged_df["#attributes"] = merged_df["#attributes"].astype(str) + "voi_designation_date=" + voi_designation_date + ';'
merged_df["#attributes"] = merged_df["#attributes"].astype(str) + "voc_designation_date=" + voc_designation_date + ';'
merged_df["#attributes"] = merged_df["#attributes"].astype(str) + "alert_designation_date=" + alert_designation_date + ';'

else:
merged_df["#attributes"] = merged_df["#attributes"].astype(str) + "clade_defining=n/a;" + "who_label=n/a;" + "variant=n/a;" + "variant_status=n/a;" + "voi_designation_date=n/a;" + "voc_designation_date=n/a;" + "alert_designation_date=n/a;"


#add ID to attributes
merged_df["#attributes"] = 'ID=' + merged_df['id'].astype(str) + ';' + merged_df["#attributes"].astype(str)
Expand Down Expand Up @@ -326,6 +330,7 @@ def add_functions(gvf, annotation_file, clade_file, strain):

file = args.vcffile


print("Processing: " + file)

#create gvf from annotated vcf (ignoring pragmas for now)
Expand All @@ -352,22 +357,21 @@ def add_functions(gvf, annotation_file, clade_file, strain):
all_strains_mutations.append(mutations)
leftover_df = leftover_df.append(leftover_names)
unmatched_clade_names = unmatched_clade_names.append(leftover_clade_names)

#save unmatched names (in tsv but not in functional_annotations) across all strains to a .tsv file
leftover_names_filepath = args.strain + "_leftover_names.tsv"
leftover_names_filepath = "leftover_names.tsv"
leftover_df.to_csv(leftover_names_filepath, sep='\t', index=False)
print("")
print("Mutation names not found in functional annotations file saved to " + leftover_names_filepath)

#save unmatched clade-defining mutation names to a .tsv file
leftover_clade_names_filepath = args.strain + "_leftover_clade_defining_names.tsv"
leftover_clade_names_filepath = "leftover_clade_defining_names.tsv"
unmatched_clade_names.to_csv(leftover_clade_names_filepath, sep='\t', index=False)
print("Clade-defining mutation names not found in the annotated VCF saved to " + leftover_clade_names_filepath)

#print number of unique mutations across all strains
flattened = [val for sublist in all_strains_mutations for val in sublist]
arr = np.array(flattened)
print("# unique mutations in " + args.strain + " VCF file: ", np.unique(arr).shape[0])
print("# unique mutations in VCF file: ", np.unique(arr).shape[0])

print("")
print("Processing complete.")
Expand Down

0 comments on commit a4433d2

Please sign in to comment.