Skip to content

Commit

Permalink
Add rank_variants (#255)
Browse files Browse the repository at this point in the history
* wip

* no clinical set before filtervep is working

* Reviewer comments
  • Loading branch information
fellen31 authored Jul 18, 2024
1 parent f580e97 commit 55c55c5
Show file tree
Hide file tree
Showing 54 changed files with 1,836 additions and 283 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#243](https://github.com/genomic-medicine-sweden/nallo/pull/243) - Added nf-test to the short variant annotation workflow
- [#245](https://github.com/genomic-medicine-sweden/nallo/pull/245) - Added repeat annotation with Stranger
- [#252](https://github.com/genomic-medicine-sweden/nallo/pull/252) - Added a new `SCATTER_GENOME` subworkflow
- [#255](https://github.com/genomic-medicine-sweden/nallo/pull/255) - Added a new `RANK_VARIANTS` subworkflow to rank SNVs using genmod

### `Changed`

Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@
1. Annotate variants with database(s) of choice, i.e. [gnomAD](https://gnomad.broadinstitute.org), [CADD](https://cadd.gs.washington.edu) etc. ([`echtvar`](https://github.com/brentp/echtvar))
2. Annotate variants ([`VEP`](https://github.com/Ensembl/ensembl-vep))

##### Filtering
##### Filtering and ranking

- TBD
- Rank variants [GENMOD](https://github.com/Clinical-Genomics/genmod)

## Usage

Expand Down
4 changes: 2 additions & 2 deletions assets/samplesheet.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
sample,file,family_id,paternal_id,maternal_id,sex,phenotype
sample_1,/path/to/fastq_or_bam/files/sample_1.fastq.gz,FAM,PAT,MAT,0,1
sample_2,/path/to/fastq_or_bam/files/sample_2.bam,FAM,PAT,MAT,1,1
sample_1,/path/to/fastq_or_bam/files/sample_1.fastq.gz,FAM,0,0,0,2
sample_2,/path/to/fastq_or_bam/files/sample_2.bam,FAM,0,0,1,1
4 changes: 2 additions & 2 deletions assets/schema_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,13 @@
"paternal_id": {
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "Paternal ID must be provided and cannot contain spaces. If no paternal ID is available, use any ID not in sample column.",
"errorMessage": "Paternal ID must be provided and cannot contain spaces. If no paternal ID is available, use 0.",
"meta": ["paternal_id"]
},
"maternal_id": {
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "Maternal ID must be provided and cannot contain spaces. If no maternal ID is available, use any ID not in sample column.",
"errorMessage": "Maternal ID must be provided and cannot contain spaces. If no maternal ID is available, use 0.",
"meta": ["maternal_id"]
},
"sex": {
Expand Down
192 changes: 192 additions & 0 deletions bin/add_most_severe_consequence.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
#!/usr/bin/env python3

# Written by Ramprasad Neethiraj and released under the MIT license.
# See git repository (https://github.com/nf-core/raredisease) for full license text.

import argparse
import gzip
import sys
from pathlib import Path
from typing import Tuple, TextIO


def parse_vep_csq_transcripts(
transcripts: list, allele_ind: int, csq_ind: int, hgnc_ind: int, var_csq: list
) -> Tuple[list, list, list, list]:
"""
Parse conseqences for each transcript and return HGNC IDs, alleles, and their severity rank
based on the term's ranking in the ensembl consequences list.
Args:
transcripts (list): A list of vep transcript annotation
allele_ind (int) : Index of the "allele" in the vep annotation record
csq_ind (int) : Index of the "Consequence" in the vep annotation record
hgnc_ind (int) : Index of the "HGNC_ID" in the vep annotation record
var_csq (list): A list of consequence terms ordered by rank
Returns:
hgnc_ids (list): list of hgnc ids in the record
alleles (list): list of alleles in the record
consequences (list): list of consequence terms in the record
severity (list): list of consequence term ranks
"""

consequences = []
hgnc_ids = []
severity = []
alleles = []
for transcript in transcripts:
vep_fields = transcript.strip().split("|")
csq = vep_fields[csq_ind].split("&")[0]
hgnc_id = vep_fields[hgnc_ind]
allele = vep_fields[allele_ind].replace("CSQ=", "")
consequences.append(csq)
hgnc_ids.append(hgnc_id)
severity.append(var_csq.index(csq))
alleles.append(allele)
return hgnc_ids, alleles, consequences, severity


def construct_most_severe_consequence_info(
line: str, allele_ind: int, csq_ind: int, hgnc_ind: int, var_csq: list
) -> list:
"""
Parse conseqences for each transcript and return HGNC IDs, alleles, and their severity rank
based on the term's ranking in the ensembl consequences list.
Args:
line (str) : Vcf record
allele_ind (int) : Index of the "allele" in the vep annotation record
csq_ind (int) : Index of the "Consequence" in the vep annotation record
hgnc_ind (int) : Index of the "HGNC_ID" in the vep annotation record
var_csq (list): A list of consequence terms ordered by rank
Returns:
columns (list): A list of fields in the vcf record with most severe consequence added
to the INFO column
"""

columns = line.strip().split()
info_fields = columns[7].split(";")
for field in info_fields:
if field.startswith("CSQ="):
transcripts = field.split("CSQ=")[1].split(",")
hgnc_ids, alleles, consequences, severity = parse_vep_csq_transcripts(
transcripts, allele_ind, csq_ind, hgnc_ind, var_csq
)
unique_ids = list(set(hgnc_ids))
mscsq_anno = []
for gene_id in unique_ids:
if gene_id != "":
indices = find_indices(hgnc_ids, gene_id)
alleles_sub = [alleles[i] for i in indices]
consequences_sub = [consequences[i] for i in indices]
severity_sub = [severity[i] for i in indices]
most_severe_csq = consequences_sub[severity_sub.index(min(severity_sub))]
most_severe_allele = alleles_sub[severity_sub.index(min(severity_sub))]
mscsq_anno.append(gene_id + ":" + most_severe_allele + "|" + most_severe_csq)
if mscsq_anno:
columns[7] += ";most_severe_consequence=" + ",".join(mscsq_anno)
return columns


def find_indices(list_to_check: list, item_to_find: str) -> list:
"""
Get indices of an element in a list
Args:
list_to_check (list)
item_to_find (value)
Returns:
indices (list)
"""
indices = []
for idx, value in enumerate(list_to_check):
if value == item_to_find:
indices.append(idx)
return indices


def parse_vep_csq_schema(line: str) -> Tuple[int, int, int]:
"""
Get indices of allele, consequence, and hgnc id in the annotation
Args:
line: INFO line in the vcf header with CSQ information
Returns:
allele_ind (int) : Index of the "allele" in the vep annotation record
csq_ind (int) : Index of the "Consequence" in the vep annotation record
hgnc_ind (int) : Index of the "HGNC_ID" in the vep annotation record
"""
fields = line.strip().split("Format: ")[1].replace('">', "").split("|")
allele_ind = fields.index("Allele")
csq_ind = fields.index("Consequence")
hgnc_ind = fields.index("HGNC_ID")

return allele_ind, csq_ind, hgnc_ind


def write_csq_annotated_vcf(file_in: TextIO, file_out: TextIO, var_csq: list):
"""Add most severe consequence field to record, and write the record to a vcf file"""
for line in file_in:
if line.startswith("#"):
file_out.write(line)
if line.startswith("##INFO=<ID=CSQ"):
allele_ind, csq_ind, hgnc_ind = parse_vep_csq_schema(line)
file_out.write(
'##INFO=<ID=most_severe_consequence,Number=.,Type=String,Description="Most severe genomic consequence.">\n'
)
else:
mscsq = construct_most_severe_consequence_info(line, allele_ind, csq_ind, hgnc_ind, var_csq)
file_out.write("\t".join(mscsq) + "\n")


def parse_args(argv=None):
"""Define and immediately parse command line arguments."""
parser = argparse.ArgumentParser(
description="Annotate vcf with the most severe consequence field.",
epilog="Example: python vcfparser.py --file_in vep.vcf --file_out vep.most_severe_csq.vcf --variant_csq variant_consequence.txt",
)
parser.add_argument(
"--file_in",
metavar="FILE_IN",
type=Path,
help="Vcf file annotated with vep.",
)
parser.add_argument(
"--file_out",
metavar="FILE_OUT",
type=Path,
help="Vcf with most_severe_consequence annotations added to it.",
)
parser.add_argument(
"--variant_csq",
metavar="VARIANT_CSQ",
type=Path,
help="Variant consequences ranked by severity",
)
return parser.parse_args(argv)


def main(argv=None):
"""Coordinate argument parsing and program execution."""
args = parse_args(argv)
if not args.file_in.is_file():
print(f"The given input file {args.file_in} was not found!")
sys.exit(2)
if not args.variant_csq.is_file():
print(f"The given variant consequence file {args.variant_csq} was not found!")
sys.exit(2)
args.file_out.parent.mkdir(parents=True, exist_ok=True)
with open(args.variant_csq) as f:
var_csq = [line.strip() for line in f]
opener = gzip.open if (args.file_in.suffix == ".gz") else open
with open(args.file_out, "w") as out_vcf:
with opener(args.file_in, "rt") as in_vcf:
write_csq_annotated_vcf(in_vcf, out_vcf, var_csq)


if __name__ == "__main__":
sys.exit(main())
129 changes: 129 additions & 0 deletions bin/add_most_severe_pli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
#!/usr/bin/env python3

# Written by Ramprasad Neethiraj and released under the MIT license.
# See git repository (https://github.com/nf-core/raredisease) for full license text.

import argparse
import gzip
import sys
from pathlib import Path
from typing import TextIO


def parse_vep_transcripts(transcripts: list, pli_ind: int) -> list:
"""
Parse each transcript and return a list of pli values.
Args:
transcripts (list): A list of vep transcript annotation
pli_ind (int) : Index of pli value in the vep annotation record
Returns:
pli_values (list): list of pli values in the record
"""

pli_values = []
for transcript in transcripts:
vep_fields = transcript.strip().split("|")
pli_value = vep_fields[pli_ind]
pli_values.append(pli_value)
return pli_values


def construct_most_severe_pli_info(line: str, pli_ind: int) -> list:
"""
Parse gene symbols, find the highest pli value of all gene symbols, add most_severe_pli tag to the info
field and return a list of modified columns
Args:
line (str) : Vcf record
pli_ind (int) : Index of pli value in the vep annotation record
Returns:
columns (list): A list of fields in the vcf record with most severe pli added
to the INFO column
"""

columns = line.strip().split()
info_fields = columns[7].split(";")
for field in info_fields:
if field.startswith("CSQ="):
transcripts = field.split("CSQ=")[1].split(",")
break
pli_values = parse_vep_transcripts(transcripts, pli_ind)
try:
pli_max = max(pli_values)
except ValueError:
pli_max = ""
if pli_max:
columns[7] += ";most_severe_pli={:.2f}".format(float(pli_max))
return columns


def parse_vep_csq_schema(line: str) -> int:
"""
Get indices of gene symbol in the annotation
Args:
line: INFO line in the vcf header with CSQ information
Returns:
pli_ind (int) : Index of pli value in the vep annotation record
"""
fields = line.strip().split("Format: ")[1].replace('">', "").split("|")
pli_ind = fields.index("pLI_gene_value")

return pli_ind


def write_pli_annotated_vcf(file_in: TextIO, file_out: TextIO):
"""Add most severe pli field to record, and write the record to a vcf file"""
for line in file_in:
if line.startswith("#"):
file_out.write(line)
if line.startswith("##INFO=<ID=CSQ") and "pLI_gene_value" in line:
pli_ind = parse_vep_csq_schema(line)
file_out.write(
'##INFO=<ID=most_severe_pli,Number=1,Type=Float,Description="Probabililty of a gene being loss-of-function intolerant score.">\n'
)
else:
vcf_record = construct_most_severe_pli_info(line, pli_ind)
file_out.write("\t".join(vcf_record) + "\n")


def parse_args(argv=None):
"""Define and immediately parse command line arguments."""
parser = argparse.ArgumentParser(
description="Annotate vcf with the most severe pli field.",
epilog="Example: python vcfparser.py --file_in vep.vcf --file_out vep.most_severe_pli.vcf",
)
parser.add_argument(
"--file_in",
metavar="FILE_IN",
type=Path,
help="Vcf file annotated with vep's pli plugin.",
)
parser.add_argument(
"--file_out",
metavar="FILE_OUT",
type=Path,
help="Vcf with most_severe_pli annotations added to it.",
)
return parser.parse_args(argv)


def main(argv=None):
"""Coordinate argument parsing and program execution."""
args = parse_args(argv)
if not args.file_in.is_file():
print(f"The given input file {args.file_in} was not found!")
sys.exit(2)
args.file_out.parent.mkdir(parents=True, exist_ok=True)
opener = gzip.open if (args.file_in.suffix == ".gz") else open
with open(args.file_out, "w") as out_vcf:
with opener(args.file_in, "rt") as in_vcf:
write_pli_annotated_vcf(in_vcf, out_vcf)


if __name__ == "__main__":
sys.exit(main())
32 changes: 32 additions & 0 deletions conf/modules/annotate_consequence_pli.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Config file for defining DSL2 per module options and publishing paths
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Available keys to override module options:
ext.args = Additional arguments appended to command in module.
ext.args2 = Second set of arguments appended to command in module (multi-tool modules).
ext.args3 = Third set of arguments appended to command in module (multi-tool modules).
ext.prefix = File name prefix for output files.
ext.when = Conditional clause
----------------------------------------------------------------------------------------
*/

process {
withName: '.*:ANN_CSQ_PLI_SNV:.*' {
publishDir = [
enabled: false
]
}

withName: '.*ANN_CSQ_PLI_SNV:ADD_MOST_SEVERE_CSQ' {
ext.prefix = { "${meta.id}_snv_csq" }
}

withName: '.*ANN_CSQ_PLI_SNV:ADD_MOST_SEVERE_PLI' {
ext.prefix = { "${meta.id}_snv_csq_pli" }
}

withName: '.*ANN_CSQ_PLI_SNV:TABIX_BGZIPTABIX' {
ext.prefix = { "${meta.id}_snv_csq_pli" }
}
}
Loading

0 comments on commit 55c55c5

Please sign in to comment.