-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* wip * no clinical set before filtervep is working * Reviewer comments
- Loading branch information
Showing
54 changed files
with
1,836 additions
and
283 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
sample,file,family_id,paternal_id,maternal_id,sex,phenotype | ||
sample_1,/path/to/fastq_or_bam/files/sample_1.fastq.gz,FAM,PAT,MAT,0,1 | ||
sample_2,/path/to/fastq_or_bam/files/sample_2.bam,FAM,PAT,MAT,1,1 | ||
sample_1,/path/to/fastq_or_bam/files/sample_1.fastq.gz,FAM,0,0,0,2 | ||
sample_2,/path/to/fastq_or_bam/files/sample_2.bam,FAM,0,0,1,1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,192 @@ | ||
#!/usr/bin/env python3 | ||
|
||
# Written by Ramprasad Neethiraj and released under the MIT license. | ||
# See git repository (https://github.com/nf-core/raredisease) for full license text. | ||
|
||
import argparse | ||
import gzip | ||
import sys | ||
from pathlib import Path | ||
from typing import Tuple, TextIO | ||
|
||
|
||
def parse_vep_csq_transcripts( | ||
transcripts: list, allele_ind: int, csq_ind: int, hgnc_ind: int, var_csq: list | ||
) -> Tuple[list, list, list, list]: | ||
""" | ||
Parse conseqences for each transcript and return HGNC IDs, alleles, and their severity rank | ||
based on the term's ranking in the ensembl consequences list. | ||
Args: | ||
transcripts (list): A list of vep transcript annotation | ||
allele_ind (int) : Index of the "allele" in the vep annotation record | ||
csq_ind (int) : Index of the "Consequence" in the vep annotation record | ||
hgnc_ind (int) : Index of the "HGNC_ID" in the vep annotation record | ||
var_csq (list): A list of consequence terms ordered by rank | ||
Returns: | ||
hgnc_ids (list): list of hgnc ids in the record | ||
alleles (list): list of alleles in the record | ||
consequences (list): list of consequence terms in the record | ||
severity (list): list of consequence term ranks | ||
""" | ||
|
||
consequences = [] | ||
hgnc_ids = [] | ||
severity = [] | ||
alleles = [] | ||
for transcript in transcripts: | ||
vep_fields = transcript.strip().split("|") | ||
csq = vep_fields[csq_ind].split("&")[0] | ||
hgnc_id = vep_fields[hgnc_ind] | ||
allele = vep_fields[allele_ind].replace("CSQ=", "") | ||
consequences.append(csq) | ||
hgnc_ids.append(hgnc_id) | ||
severity.append(var_csq.index(csq)) | ||
alleles.append(allele) | ||
return hgnc_ids, alleles, consequences, severity | ||
|
||
|
||
def construct_most_severe_consequence_info( | ||
line: str, allele_ind: int, csq_ind: int, hgnc_ind: int, var_csq: list | ||
) -> list: | ||
""" | ||
Parse conseqences for each transcript and return HGNC IDs, alleles, and their severity rank | ||
based on the term's ranking in the ensembl consequences list. | ||
Args: | ||
line (str) : Vcf record | ||
allele_ind (int) : Index of the "allele" in the vep annotation record | ||
csq_ind (int) : Index of the "Consequence" in the vep annotation record | ||
hgnc_ind (int) : Index of the "HGNC_ID" in the vep annotation record | ||
var_csq (list): A list of consequence terms ordered by rank | ||
Returns: | ||
columns (list): A list of fields in the vcf record with most severe consequence added | ||
to the INFO column | ||
""" | ||
|
||
columns = line.strip().split() | ||
info_fields = columns[7].split(";") | ||
for field in info_fields: | ||
if field.startswith("CSQ="): | ||
transcripts = field.split("CSQ=")[1].split(",") | ||
hgnc_ids, alleles, consequences, severity = parse_vep_csq_transcripts( | ||
transcripts, allele_ind, csq_ind, hgnc_ind, var_csq | ||
) | ||
unique_ids = list(set(hgnc_ids)) | ||
mscsq_anno = [] | ||
for gene_id in unique_ids: | ||
if gene_id != "": | ||
indices = find_indices(hgnc_ids, gene_id) | ||
alleles_sub = [alleles[i] for i in indices] | ||
consequences_sub = [consequences[i] for i in indices] | ||
severity_sub = [severity[i] for i in indices] | ||
most_severe_csq = consequences_sub[severity_sub.index(min(severity_sub))] | ||
most_severe_allele = alleles_sub[severity_sub.index(min(severity_sub))] | ||
mscsq_anno.append(gene_id + ":" + most_severe_allele + "|" + most_severe_csq) | ||
if mscsq_anno: | ||
columns[7] += ";most_severe_consequence=" + ",".join(mscsq_anno) | ||
return columns | ||
|
||
|
||
def find_indices(list_to_check: list, item_to_find: str) -> list: | ||
""" | ||
Get indices of an element in a list | ||
Args: | ||
list_to_check (list) | ||
item_to_find (value) | ||
Returns: | ||
indices (list) | ||
""" | ||
indices = [] | ||
for idx, value in enumerate(list_to_check): | ||
if value == item_to_find: | ||
indices.append(idx) | ||
return indices | ||
|
||
|
||
def parse_vep_csq_schema(line: str) -> Tuple[int, int, int]: | ||
""" | ||
Get indices of allele, consequence, and hgnc id in the annotation | ||
Args: | ||
line: INFO line in the vcf header with CSQ information | ||
Returns: | ||
allele_ind (int) : Index of the "allele" in the vep annotation record | ||
csq_ind (int) : Index of the "Consequence" in the vep annotation record | ||
hgnc_ind (int) : Index of the "HGNC_ID" in the vep annotation record | ||
""" | ||
fields = line.strip().split("Format: ")[1].replace('">', "").split("|") | ||
allele_ind = fields.index("Allele") | ||
csq_ind = fields.index("Consequence") | ||
hgnc_ind = fields.index("HGNC_ID") | ||
|
||
return allele_ind, csq_ind, hgnc_ind | ||
|
||
|
||
def write_csq_annotated_vcf(file_in: TextIO, file_out: TextIO, var_csq: list): | ||
"""Add most severe consequence field to record, and write the record to a vcf file""" | ||
for line in file_in: | ||
if line.startswith("#"): | ||
file_out.write(line) | ||
if line.startswith("##INFO=<ID=CSQ"): | ||
allele_ind, csq_ind, hgnc_ind = parse_vep_csq_schema(line) | ||
file_out.write( | ||
'##INFO=<ID=most_severe_consequence,Number=.,Type=String,Description="Most severe genomic consequence.">\n' | ||
) | ||
else: | ||
mscsq = construct_most_severe_consequence_info(line, allele_ind, csq_ind, hgnc_ind, var_csq) | ||
file_out.write("\t".join(mscsq) + "\n") | ||
|
||
|
||
def parse_args(argv=None): | ||
"""Define and immediately parse command line arguments.""" | ||
parser = argparse.ArgumentParser( | ||
description="Annotate vcf with the most severe consequence field.", | ||
epilog="Example: python vcfparser.py --file_in vep.vcf --file_out vep.most_severe_csq.vcf --variant_csq variant_consequence.txt", | ||
) | ||
parser.add_argument( | ||
"--file_in", | ||
metavar="FILE_IN", | ||
type=Path, | ||
help="Vcf file annotated with vep.", | ||
) | ||
parser.add_argument( | ||
"--file_out", | ||
metavar="FILE_OUT", | ||
type=Path, | ||
help="Vcf with most_severe_consequence annotations added to it.", | ||
) | ||
parser.add_argument( | ||
"--variant_csq", | ||
metavar="VARIANT_CSQ", | ||
type=Path, | ||
help="Variant consequences ranked by severity", | ||
) | ||
return parser.parse_args(argv) | ||
|
||
|
||
def main(argv=None): | ||
"""Coordinate argument parsing and program execution.""" | ||
args = parse_args(argv) | ||
if not args.file_in.is_file(): | ||
print(f"The given input file {args.file_in} was not found!") | ||
sys.exit(2) | ||
if not args.variant_csq.is_file(): | ||
print(f"The given variant consequence file {args.variant_csq} was not found!") | ||
sys.exit(2) | ||
args.file_out.parent.mkdir(parents=True, exist_ok=True) | ||
with open(args.variant_csq) as f: | ||
var_csq = [line.strip() for line in f] | ||
opener = gzip.open if (args.file_in.suffix == ".gz") else open | ||
with open(args.file_out, "w") as out_vcf: | ||
with opener(args.file_in, "rt") as in_vcf: | ||
write_csq_annotated_vcf(in_vcf, out_vcf, var_csq) | ||
|
||
|
||
if __name__ == "__main__": | ||
sys.exit(main()) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
#!/usr/bin/env python3 | ||
|
||
# Written by Ramprasad Neethiraj and released under the MIT license. | ||
# See git repository (https://github.com/nf-core/raredisease) for full license text. | ||
|
||
import argparse | ||
import gzip | ||
import sys | ||
from pathlib import Path | ||
from typing import TextIO | ||
|
||
|
||
def parse_vep_transcripts(transcripts: list, pli_ind: int) -> list: | ||
""" | ||
Parse each transcript and return a list of pli values. | ||
Args: | ||
transcripts (list): A list of vep transcript annotation | ||
pli_ind (int) : Index of pli value in the vep annotation record | ||
Returns: | ||
pli_values (list): list of pli values in the record | ||
""" | ||
|
||
pli_values = [] | ||
for transcript in transcripts: | ||
vep_fields = transcript.strip().split("|") | ||
pli_value = vep_fields[pli_ind] | ||
pli_values.append(pli_value) | ||
return pli_values | ||
|
||
|
||
def construct_most_severe_pli_info(line: str, pli_ind: int) -> list: | ||
""" | ||
Parse gene symbols, find the highest pli value of all gene symbols, add most_severe_pli tag to the info | ||
field and return a list of modified columns | ||
Args: | ||
line (str) : Vcf record | ||
pli_ind (int) : Index of pli value in the vep annotation record | ||
Returns: | ||
columns (list): A list of fields in the vcf record with most severe pli added | ||
to the INFO column | ||
""" | ||
|
||
columns = line.strip().split() | ||
info_fields = columns[7].split(";") | ||
for field in info_fields: | ||
if field.startswith("CSQ="): | ||
transcripts = field.split("CSQ=")[1].split(",") | ||
break | ||
pli_values = parse_vep_transcripts(transcripts, pli_ind) | ||
try: | ||
pli_max = max(pli_values) | ||
except ValueError: | ||
pli_max = "" | ||
if pli_max: | ||
columns[7] += ";most_severe_pli={:.2f}".format(float(pli_max)) | ||
return columns | ||
|
||
|
||
def parse_vep_csq_schema(line: str) -> int: | ||
""" | ||
Get indices of gene symbol in the annotation | ||
Args: | ||
line: INFO line in the vcf header with CSQ information | ||
Returns: | ||
pli_ind (int) : Index of pli value in the vep annotation record | ||
""" | ||
fields = line.strip().split("Format: ")[1].replace('">', "").split("|") | ||
pli_ind = fields.index("pLI_gene_value") | ||
|
||
return pli_ind | ||
|
||
|
||
def write_pli_annotated_vcf(file_in: TextIO, file_out: TextIO): | ||
"""Add most severe pli field to record, and write the record to a vcf file""" | ||
for line in file_in: | ||
if line.startswith("#"): | ||
file_out.write(line) | ||
if line.startswith("##INFO=<ID=CSQ") and "pLI_gene_value" in line: | ||
pli_ind = parse_vep_csq_schema(line) | ||
file_out.write( | ||
'##INFO=<ID=most_severe_pli,Number=1,Type=Float,Description="Probabililty of a gene being loss-of-function intolerant score.">\n' | ||
) | ||
else: | ||
vcf_record = construct_most_severe_pli_info(line, pli_ind) | ||
file_out.write("\t".join(vcf_record) + "\n") | ||
|
||
|
||
def parse_args(argv=None): | ||
"""Define and immediately parse command line arguments.""" | ||
parser = argparse.ArgumentParser( | ||
description="Annotate vcf with the most severe pli field.", | ||
epilog="Example: python vcfparser.py --file_in vep.vcf --file_out vep.most_severe_pli.vcf", | ||
) | ||
parser.add_argument( | ||
"--file_in", | ||
metavar="FILE_IN", | ||
type=Path, | ||
help="Vcf file annotated with vep's pli plugin.", | ||
) | ||
parser.add_argument( | ||
"--file_out", | ||
metavar="FILE_OUT", | ||
type=Path, | ||
help="Vcf with most_severe_pli annotations added to it.", | ||
) | ||
return parser.parse_args(argv) | ||
|
||
|
||
def main(argv=None): | ||
"""Coordinate argument parsing and program execution.""" | ||
args = parse_args(argv) | ||
if not args.file_in.is_file(): | ||
print(f"The given input file {args.file_in} was not found!") | ||
sys.exit(2) | ||
args.file_out.parent.mkdir(parents=True, exist_ok=True) | ||
opener = gzip.open if (args.file_in.suffix == ".gz") else open | ||
with open(args.file_out, "w") as out_vcf: | ||
with opener(args.file_in, "rt") as in_vcf: | ||
write_pli_annotated_vcf(in_vcf, out_vcf) | ||
|
||
|
||
if __name__ == "__main__": | ||
sys.exit(main()) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
/* | ||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
Config file for defining DSL2 per module options and publishing paths | ||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
Available keys to override module options: | ||
ext.args = Additional arguments appended to command in module. | ||
ext.args2 = Second set of arguments appended to command in module (multi-tool modules). | ||
ext.args3 = Third set of arguments appended to command in module (multi-tool modules). | ||
ext.prefix = File name prefix for output files. | ||
ext.when = Conditional clause | ||
---------------------------------------------------------------------------------------- | ||
*/ | ||
|
||
process { | ||
withName: '.*:ANN_CSQ_PLI_SNV:.*' { | ||
publishDir = [ | ||
enabled: false | ||
] | ||
} | ||
|
||
withName: '.*ANN_CSQ_PLI_SNV:ADD_MOST_SEVERE_CSQ' { | ||
ext.prefix = { "${meta.id}_snv_csq" } | ||
} | ||
|
||
withName: '.*ANN_CSQ_PLI_SNV:ADD_MOST_SEVERE_PLI' { | ||
ext.prefix = { "${meta.id}_snv_csq_pli" } | ||
} | ||
|
||
withName: '.*ANN_CSQ_PLI_SNV:TABIX_BGZIPTABIX' { | ||
ext.prefix = { "${meta.id}_snv_csq_pli" } | ||
} | ||
} |
Oops, something went wrong.