Skip to content

Commit

Permalink
Annotate delly variants function
Browse files Browse the repository at this point in the history
  • Loading branch information
mhkc committed Mar 19, 2024
1 parent 37db7c4 commit 6518f52
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
### Added

- Add optional SNV and structural variants to sample output
- Add function for annotating delly variants intersecting with resistance targets

### Fixed

Expand Down
41 changes: 40 additions & 1 deletion prp/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
import logging
from typing import List

import pysam
import click
import pandas as pd
from cyvcf2 import VCF, Writer
from pydantic import TypeAdapter, ValidationError
from pathlib import Path

Expand Down Expand Up @@ -32,6 +34,7 @@
parse_virulencefinder_vir_pred,
load_variants,
)
from .parse.variant import annotate_delly_variants
from .parse.mapping import get_reference_seq_accnr
from .parse.metadata import get_database_info, parse_run_info, get_gb_genome_version
from .parse.utils import get_db_version
Expand All @@ -46,7 +49,7 @@

@click.group()
def cli():
"""Base CLI entrypoint."""
"""Jasen pipeline result processing tool."""


@cli.command()
Expand Down Expand Up @@ -398,3 +401,39 @@ def create_qc_result(sample_id, bam, bed, baits, reference, cpus, output) -> Non
LOG.info("Parse alignment results")
parse_alignment_results(sample_id, bam, reference, cpus, output, bed, baits)
click.secho("Finished generating QC output", fg="green")


@cli.command()
@click.option('-v', '--vcf', type=click.Path(exists=True), help='VCF file')
@click.option('-b', '--bed', type=click.Path(exists=True), help='BED file')
@click.argument('output', type=click.Path(writable=True))
def annotate_delly(vcf, bed, output):
"""Annotate Delly SV varinats with genes in BED file."""
output = Path(output)
# load annotation
if bed is not None:
annotation = pysam.TabixFile(bed, parser=pysam.asTuple())
else:
raise click.UsageError('You must provide a annotation file.')

vcf_obj = VCF(vcf)
variant = next(vcf_obj)
annot_chrom = False
if not variant.CHROM in annotation.contigs:
if len(annotation.contigs) > 1:
raise click.UsageError(
f'"{variant.CHROM}" not in BED file and the file contains {len(annotation.contigs)} chromosomes'
)
else:
annot_chrom = True
LOG.warning(f"Annotating variant chromosome to {annotation.contigs[0]}")
# reset vcf file
vcf_obj = VCF(vcf)
vcf_obj.add_info_to_header({'ID': 'gene', 'Description': 'overlapping gene', 'Type':'Character', 'Number': '1'})
vcf_obj.add_info_to_header({'ID': 'locus_tag', 'Description': 'overlapping tbdb locus tag', 'Type':'Character', 'Number': '1'})

# open vcf writer
writer = Writer(output.absolute(), vcf_obj)
annotate_delly_variants(writer, vcf_obj, annotation, annot_chrom=annot_chrom)

click.secho(f"Wrote annotated delly variants to {output}", fg='green')
30 changes: 28 additions & 2 deletions prp/parse/variant.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Parse variant from VCF files."""

from cyvcf2 import VCF, Variant
from cyvcf2 import VCF, Variant, Writer
from typing import List
from prp.models.phenotype import VariantBase, VariantType
import logging
Expand Down Expand Up @@ -82,4 +82,30 @@ def load_variants(variant_file: str) -> List[VariantBase]:
for var_id, variant in enumerate(vcf_obj, start=1):
variants.append(parse_variant(variant, var_id=var_id, caller=variant_caller))

return variants
return variants


def annotate_delly_variants(writer, vcf, annotation, annot_chrom=False):
LOCUS_TAG = 3
GENE_SYMBOL = 4
# annotate variant
n_annotated = 0
for variant in vcf:
# update chromosome
if annot_chrom:
variant.CHROM = annotation.contigs[0]
# get genes intersecting with SV
genes = [
{'gene_symbol': gene[GENE_SYMBOL], 'locus_tag': gene[LOCUS_TAG]}
for gene
in annotation.fetch(variant.CHROM, variant.start, variant.end)
]
# add overlapping genes to INFO
if len(genes) > 0:
variant.INFO['gene'] = ','.join([gene['gene_symbol'] for gene in genes])
variant.INFO['locus_tag'] = ','.join([gene['locus_tag'] for gene in genes])
n_annotated += 1

# write variant
writer.write_record(variant)
LOG.info("Annotated %d SV variants", n_annotated)

0 comments on commit 6518f52

Please sign in to comment.