Add rank_variants (#255)

* wip * no clinical set before filtervep is working * Reviewer comments
genomic-medicine-sweden · Jul 18, 2024 · 55c55c5 · 55c55c5
1 parent f580e97
commit 55c55c5
Show file tree

Hide file tree

Showing 54 changed files with 1,836 additions and 283 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#243](https://github.com/genomic-medicine-sweden/nallo/pull/243) - Added nf-test to the short variant annotation workflow
 - [#245](https://github.com/genomic-medicine-sweden/nallo/pull/245) - Added repeat annotation with Stranger
 - [#252](https://github.com/genomic-medicine-sweden/nallo/pull/252) - Added a new `SCATTER_GENOME` subworkflow
+- [#255](https://github.com/genomic-medicine-sweden/nallo/pull/255) - Added a new `RANK_VARIANTS` subworkflow to rank SNVs using genmod
 
 ### `Changed`
 

diff --git a/README.md b/README.md
@@ -44,9 +44,9 @@
 1. Annotate variants with database(s) of choice, i.e. [gnomAD](https://gnomad.broadinstitute.org), [CADD](https://cadd.gs.washington.edu) etc. ([`echtvar`](https://github.com/brentp/echtvar))
 2. Annotate variants ([`VEP`](https://github.com/Ensembl/ensembl-vep))
 
-##### Filtering
+##### Filtering and ranking
 
-- TBD
+- Rank variants [GENMOD](https://github.com/Clinical-Genomics/genmod)
 
 ## Usage
 

diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
@@ -1,3 +1,3 @@
 sample,file,family_id,paternal_id,maternal_id,sex,phenotype
-sample_1,/path/to/fastq_or_bam/files/sample_1.fastq.gz,FAM,PAT,MAT,0,1
-sample_2,/path/to/fastq_or_bam/files/sample_2.bam,FAM,PAT,MAT,1,1
+sample_1,/path/to/fastq_or_bam/files/sample_1.fastq.gz,FAM,0,0,0,2
+sample_2,/path/to/fastq_or_bam/files/sample_2.bam,FAM,0,0,1,1
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -28,13 +28,13 @@
             "paternal_id": {
                 "type": "string",
                 "pattern": "^\\S+$",
-                "errorMessage": "Paternal ID must be provided and cannot contain spaces. If no paternal ID is available, use any ID not in sample column.",
+                "errorMessage": "Paternal ID must be provided and cannot contain spaces. If no paternal ID is available, use 0.",
                 "meta": ["paternal_id"]
             },
             "maternal_id": {
                 "type": "string",
                 "pattern": "^\\S+$",
-                "errorMessage": "Maternal ID must be provided and cannot contain spaces. If no maternal ID is available, use any ID not in sample column.",
+                "errorMessage": "Maternal ID must be provided and cannot contain spaces. If no maternal ID is available, use 0.",
                 "meta": ["maternal_id"]
             },
             "sex": {

diff --git a/bin/add_most_severe_consequence.py b/bin/add_most_severe_consequence.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+
+# Written by Ramprasad Neethiraj and released under the MIT license.
+# See git repository (https://github.com/nf-core/raredisease) for full license text.
+
+import argparse
+import gzip
+import sys
+from pathlib import Path
+from typing import Tuple, TextIO
+
+
+def parse_vep_csq_transcripts(
+    transcripts: list, allele_ind: int, csq_ind: int, hgnc_ind: int, var_csq: list
+) -> Tuple[list, list, list, list]:
+    """
+    Parse conseqences for each transcript and return HGNC IDs, alleles, and their severity rank
+    based on the term's ranking in the ensembl consequences list.
+
+    Args:
+        transcripts (list): A list of vep transcript annotation
+        allele_ind  (int) : Index of the "allele" in the vep annotation record
+        csq_ind     (int) : Index of the "Consequence" in the vep annotation record
+        hgnc_ind    (int) : Index of the "HGNC_ID" in the vep annotation record
+        var_csq     (list): A list of consequence terms ordered by rank
+
+    Returns:
+        hgnc_ids     (list): list of hgnc ids in the record
+        alleles      (list): list of alleles in the record
+        consequences (list): list of consequence terms in the record
+        severity     (list): list of consequence term ranks
+    """
+
+    consequences = []
+    hgnc_ids = []
+    severity = []
+    alleles = []
+    for transcript in transcripts:
+        vep_fields = transcript.strip().split("|")
+        csq = vep_fields[csq_ind].split("&")[0]
+        hgnc_id = vep_fields[hgnc_ind]
+        allele = vep_fields[allele_ind].replace("CSQ=", "")
+        consequences.append(csq)
+        hgnc_ids.append(hgnc_id)
+        severity.append(var_csq.index(csq))
+        alleles.append(allele)
+    return hgnc_ids, alleles, consequences, severity
+
+
+def construct_most_severe_consequence_info(
+    line: str, allele_ind: int, csq_ind: int, hgnc_ind: int, var_csq: list
+) -> list:
+    """
+    Parse conseqences for each transcript and return HGNC IDs, alleles, and their severity rank
+    based on the term's ranking in the ensembl consequences list.
+
+    Args:
+        line        (str) : Vcf record
+        allele_ind  (int) : Index of the "allele" in the vep annotation record
+        csq_ind     (int) : Index of the "Consequence" in the vep annotation record
+        hgnc_ind    (int) : Index of the "HGNC_ID" in the vep annotation record
+        var_csq     (list): A list of consequence terms ordered by rank
+
+    Returns:
+        columns     (list): A list of fields in the vcf record with most severe consequence added
+                            to the INFO column
+    """
+
+    columns = line.strip().split()
+    info_fields = columns[7].split(";")
+    for field in info_fields:
+        if field.startswith("CSQ="):
+            transcripts = field.split("CSQ=")[1].split(",")
+    hgnc_ids, alleles, consequences, severity = parse_vep_csq_transcripts(
+        transcripts, allele_ind, csq_ind, hgnc_ind, var_csq
+    )
+    unique_ids = list(set(hgnc_ids))
+    mscsq_anno = []
+    for gene_id in unique_ids:
+        if gene_id != "":
+            indices = find_indices(hgnc_ids, gene_id)
+            alleles_sub = [alleles[i] for i in indices]
+            consequences_sub = [consequences[i] for i in indices]
+            severity_sub = [severity[i] for i in indices]
+            most_severe_csq = consequences_sub[severity_sub.index(min(severity_sub))]
+            most_severe_allele = alleles_sub[severity_sub.index(min(severity_sub))]
+            mscsq_anno.append(gene_id + ":" + most_severe_allele + "|" + most_severe_csq)
+    if mscsq_anno:
+        columns[7] += ";most_severe_consequence=" + ",".join(mscsq_anno)
+    return columns
+
+
+def find_indices(list_to_check: list, item_to_find: str) -> list:
+    """
+    Get indices of an element in a list
+
+    Args:
+        list_to_check (list)
+        item_to_find  (value)
+
+    Returns:
+        indices       (list)
+    """
+    indices = []
+    for idx, value in enumerate(list_to_check):
+        if value == item_to_find:
+            indices.append(idx)
+    return indices
+
+
+def parse_vep_csq_schema(line: str) -> Tuple[int, int, int]:
+    """
+    Get indices of allele, consequence, and hgnc id in the annotation
+
+    Args:
+        line: INFO line in the vcf header with CSQ information
+
+    Returns:
+        allele_ind  (int) : Index of the "allele" in the vep annotation record
+        csq_ind     (int) : Index of the "Consequence" in the vep annotation record
+        hgnc_ind    (int) : Index of the "HGNC_ID" in the vep annotation record
+    """
+    fields = line.strip().split("Format: ")[1].replace('">', "").split("|")
+    allele_ind = fields.index("Allele")
+    csq_ind = fields.index("Consequence")
+    hgnc_ind = fields.index("HGNC_ID")
+
+    return allele_ind, csq_ind, hgnc_ind
+
+
+def write_csq_annotated_vcf(file_in: TextIO, file_out: TextIO, var_csq: list):
+    """Add most severe consequence field to record, and write the record to a vcf file"""
+    for line in file_in:
+        if line.startswith("#"):
+            file_out.write(line)
+            if line.startswith("##INFO=<ID=CSQ"):
+                allele_ind, csq_ind, hgnc_ind = parse_vep_csq_schema(line)
+                file_out.write(
+                    '##INFO=<ID=most_severe_consequence,Number=.,Type=String,Description="Most severe genomic consequence.">\n'
+                )
+        else:
+            mscsq = construct_most_severe_consequence_info(line, allele_ind, csq_ind, hgnc_ind, var_csq)
+            file_out.write("\t".join(mscsq) + "\n")
+
+
+def parse_args(argv=None):
+    """Define and immediately parse command line arguments."""
+    parser = argparse.ArgumentParser(
+        description="Annotate vcf with the most severe consequence field.",
+        epilog="Example: python vcfparser.py --file_in vep.vcf --file_out vep.most_severe_csq.vcf --variant_csq variant_consequence.txt",
+    )
+    parser.add_argument(
+        "--file_in",
+        metavar="FILE_IN",
+        type=Path,
+        help="Vcf file annotated with vep.",
+    )
+    parser.add_argument(
+        "--file_out",
+        metavar="FILE_OUT",
+        type=Path,
+        help="Vcf with most_severe_consequence annotations added to it.",
+    )
+    parser.add_argument(
+        "--variant_csq",
+        metavar="VARIANT_CSQ",
+        type=Path,
+        help="Variant consequences ranked by severity",
+    )
+    return parser.parse_args(argv)
+
+
+def main(argv=None):
+    """Coordinate argument parsing and program execution."""
+    args = parse_args(argv)
+    if not args.file_in.is_file():
+        print(f"The given input file {args.file_in} was not found!")
+        sys.exit(2)
+    if not args.variant_csq.is_file():
+        print(f"The given variant consequence file {args.variant_csq} was not found!")
+        sys.exit(2)
+    args.file_out.parent.mkdir(parents=True, exist_ok=True)
+    with open(args.variant_csq) as f:
+        var_csq = [line.strip() for line in f]
+    opener = gzip.open if (args.file_in.suffix == ".gz") else open
+    with open(args.file_out, "w") as out_vcf:
+        with opener(args.file_in, "rt") as in_vcf:
+            write_csq_annotated_vcf(in_vcf, out_vcf, var_csq)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/bin/add_most_severe_pli.py b/bin/add_most_severe_pli.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+
+# Written by Ramprasad Neethiraj and released under the MIT license.
+# See git repository (https://github.com/nf-core/raredisease) for full license text.
+
+import argparse
+import gzip
+import sys
+from pathlib import Path
+from typing import TextIO
+
+
+def parse_vep_transcripts(transcripts: list, pli_ind: int) -> list:
+    """
+    Parse each transcript and return a list of pli values.
+
+    Args:
+        transcripts (list): A list of vep transcript annotation
+        pli_ind     (int) : Index of pli value in the vep annotation record
+
+    Returns:
+        pli_values  (list): list of pli values in the record
+    """
+
+    pli_values = []
+    for transcript in transcripts:
+        vep_fields = transcript.strip().split("|")
+        pli_value = vep_fields[pli_ind]
+        pli_values.append(pli_value)
+    return pli_values
+
+
+def construct_most_severe_pli_info(line: str, pli_ind: int) -> list:
+    """
+    Parse gene symbols, find the highest pli value of all gene symbols, add most_severe_pli tag to the info
+    field and return a list of modified columns
+
+    Args:
+        line        (str) : Vcf record
+        pli_ind     (int) : Index of pli value in the vep annotation record
+
+    Returns:
+        columns     (list): A list of fields in the vcf record with most severe pli added
+                            to the INFO column
+    """
+
+    columns = line.strip().split()
+    info_fields = columns[7].split(";")
+    for field in info_fields:
+        if field.startswith("CSQ="):
+            transcripts = field.split("CSQ=")[1].split(",")
+            break
+    pli_values = parse_vep_transcripts(transcripts, pli_ind)
+    try:
+        pli_max = max(pli_values)
+    except ValueError:
+        pli_max = ""
+    if pli_max:
+        columns[7] += ";most_severe_pli={:.2f}".format(float(pli_max))
+    return columns
+
+
+def parse_vep_csq_schema(line: str) -> int:
+    """
+    Get indices of gene symbol in the annotation
+
+    Args:
+        line: INFO line in the vcf header with CSQ information
+
+    Returns:
+        pli_ind  (int) : Index of pli value in the vep annotation record
+    """
+    fields = line.strip().split("Format: ")[1].replace('">', "").split("|")
+    pli_ind = fields.index("pLI_gene_value")
+
+    return pli_ind
+
+
+def write_pli_annotated_vcf(file_in: TextIO, file_out: TextIO):
+    """Add most severe pli field to record, and write the record to a vcf file"""
+    for line in file_in:
+        if line.startswith("#"):
+            file_out.write(line)
+            if line.startswith("##INFO=<ID=CSQ") and "pLI_gene_value" in line:
+                pli_ind = parse_vep_csq_schema(line)
+                file_out.write(
+                    '##INFO=<ID=most_severe_pli,Number=1,Type=Float,Description="Probabililty of a gene being loss-of-function intolerant score.">\n'
+                )
+        else:
+            vcf_record = construct_most_severe_pli_info(line, pli_ind)
+            file_out.write("\t".join(vcf_record) + "\n")
+
+
+def parse_args(argv=None):
+    """Define and immediately parse command line arguments."""
+    parser = argparse.ArgumentParser(
+        description="Annotate vcf with the most severe pli field.",
+        epilog="Example: python vcfparser.py --file_in vep.vcf --file_out vep.most_severe_pli.vcf",
+    )
+    parser.add_argument(
+        "--file_in",
+        metavar="FILE_IN",
+        type=Path,
+        help="Vcf file annotated with vep's pli plugin.",
+    )
+    parser.add_argument(
+        "--file_out",
+        metavar="FILE_OUT",
+        type=Path,
+        help="Vcf with most_severe_pli annotations added to it.",
+    )
+    return parser.parse_args(argv)
+
+
+def main(argv=None):
+    """Coordinate argument parsing and program execution."""
+    args = parse_args(argv)
+    if not args.file_in.is_file():
+        print(f"The given input file {args.file_in} was not found!")
+        sys.exit(2)
+    args.file_out.parent.mkdir(parents=True, exist_ok=True)
+    opener = gzip.open if (args.file_in.suffix == ".gz") else open
+    with open(args.file_out, "w") as out_vcf:
+        with opener(args.file_in, "rt") as in_vcf:
+            write_pli_annotated_vcf(in_vcf, out_vcf)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/conf/modules/annotate_consequence_pli.config b/conf/modules/annotate_consequence_pli.config
@@ -0,0 +1,32 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Config file for defining DSL2 per module options and publishing paths
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Available keys to override module options:
+        ext.args            = Additional arguments appended to command in module.
+        ext.args2           = Second set of arguments appended to command in module (multi-tool modules).
+        ext.args3           = Third set of arguments appended to command in module (multi-tool modules).
+        ext.prefix          = File name prefix for output files.
+        ext.when            = Conditional clause
+----------------------------------------------------------------------------------------
+*/
+
+process {
+    withName: '.*:ANN_CSQ_PLI_SNV:.*' {
+        publishDir = [
+            enabled: false
+        ]
+    }
+
+    withName: '.*ANN_CSQ_PLI_SNV:ADD_MOST_SEVERE_CSQ' {
+        ext.prefix = { "${meta.id}_snv_csq" }
+    }
+
+    withName: '.*ANN_CSQ_PLI_SNV:ADD_MOST_SEVERE_PLI' {
+        ext.prefix = { "${meta.id}_snv_csq_pli" }
+    }
+
+    withName: '.*ANN_CSQ_PLI_SNV:TABIX_BGZIPTABIX' {
+        ext.prefix = { "${meta.id}_snv_csq_pli" }
+    }
+}