Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add emmtyper, update shigapass and fix quast #92

Merged
merged 9 commits into from
Oct 10, 2024
9 changes: 8 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,24 @@

### Added

- Added emmtyper and parser
- Added pytests for emmtyper

### Fixed

### Changed

- Changed Shigapass models to be consistent with other typing models
- Changed Shigapass parsers to be consistent with other typing parsers
- Changed ref genome related variables to be optional in quast

## [0.10.1]

### Added

### Fixed

- Updated parsing of ChewBBACA allele calling annotations and novel alleles. This adds support for annotations introduced in v3.
- Updated parsing of ChewBBACA allele calling annotations and novel alleles. This adds support for annotations introduced in v3.

### Changed

Expand Down
10 changes: 10 additions & 0 deletions prp/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
parse_amrfinder_amr_pred,
parse_amrfinder_vir_pred,
parse_cgmlst_results,
parse_emmtyper_pred,
parse_kraken_result,
parse_mlst_results,
parse_mykrobe_amr_pred,
Expand Down Expand Up @@ -116,6 +117,7 @@ def cli(silent, debug):
)
@click.option("-p", "--quality", type=click.Path(), help="postalignqc qc results")
@click.option("-k", "--mykrobe", type=click.Path(), help="mykrobe results")
@click.option("-e", "--emmtyper", type=click.Path(), help="Emmtyper m-type prediction results")
@click.option("-g", "--shigapass", type=click.Path(), help="shigapass results")
@click.option("-t", "--tbprofiler", type=click.Path(), help="tbprofiler results")
@click.option("--bam", type=click.Path(), help="Read mapping to reference genome")
Expand Down Expand Up @@ -153,6 +155,7 @@ def create_bonsai_input(
serotypefinder,
quality,
mykrobe,
emmtyper,
shigapass,
tbprofiler,
bam,
Expand Down Expand Up @@ -246,6 +249,13 @@ def create_bonsai_input(
if res is not None:
results["typing_result"].extend(res)

if emmtyper:
LOG.info("Parse emmtyper results")
# Emmtyping
res: MethodIndex | None = parse_emmtyper_pred(emmtyper)
if res is not None:
results["typing_result"].extend(res)

if shigapass:
LOG.info("Parse shigapass results")
# Shigatyping
Expand Down
2 changes: 1 addition & 1 deletion prp/models/phenotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ class GeneBase(BaseModel):
default=None, description="Reference sequence name"
)
element_type: ElementType = Field(
description="The predominant function fo the gene."
description="The predominant function of the gene."
)
element_subtype: Union[
ElementStressSubtype,
Expand Down
6 changes: 3 additions & 3 deletions prp/models/qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@ class QuastQcResult(BaseModel):
"""Assembly QC metrics."""

total_length: int
reference_length: int
reference_length: int | None = None
largest_contig: int
n_contigs: int
n50: int
assembly_gc: float
reference_gc: float
duplication_ratio: float
reference_gc: float | None = None
duplication_ratio: float | None = None


class PostAlignQcResult(BaseModel):
Expand Down
3 changes: 2 additions & 1 deletion prp/models/sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from .typing import (
ResultLineageBase,
ShigaTypingMethodIndex,
EmmTypingMethodIndex,
TbProfilerLineage,
TypingMethod,
TypingResultCgMlst,
Expand Down Expand Up @@ -80,7 +81,7 @@ class PipelineResult(SampleBase):

schema_version: Literal[1] = 1
# optional typing
typing_result: list[Union[ShigaTypingMethodIndex, MethodIndex]] = Field(
typing_result: list[Union[ShigaTypingMethodIndex, EmmTypingMethodIndex, MethodIndex]] = Field(
..., alias="typingResult"
)
# optional phenotype prediction
Expand Down
19 changes: 19 additions & 0 deletions prp/models/typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ class TypingSoftware(str, Enum):
VIRULENCEFINDER = "virulencefinder"
SEROTYPEFINDER = "serotypefinder"
SHIGAPASS = "shigapass"
EMMTYPER = "emmtyper"


class TypingMethod(str, Enum):
Expand All @@ -31,6 +32,7 @@ class TypingMethod(str, Enum):
OTYPE = "O_type"
HTYPE = "H_type"
SHIGATYPE = "shigatype"
EMMTYPE = "emmtype"


class ChewbbacaErrors(str, Enum):
Expand Down Expand Up @@ -97,6 +99,23 @@ class ShigaTypingMethodIndex(RWModel):
result: TypingResultShiga


class TypingResultEmm(RWModel):
    """Emm typing result reported for one row of emmtyper output."""

    # Each field carries the value of the emmtyper output column with the
    # same name; the sample name column is not stored here.
    cluster_count: int
    emmtype: str
    # individual allele names, one list entry per allele
    emm_like_alleles: list[str]
    emm_cluster: str


class EmmTypingMethodIndex(RWModel):
    """Typing result entry for emm typing performed with emmtyper."""

    # ``type`` and ``software`` are pinned to the emm-specific enum members
    # so the entry only validates for emmtyper results.
    type: Literal[TypingMethod.EMMTYPE]
    software: Literal[TypingSoftware.EMMTYPER]
    result: TypingResultEmm


class ResultLineageBase(RWModel):
"""Lineage results"""

Expand Down
1 change: 1 addition & 0 deletions prp/parse/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from .phenotype import (
parse_amrfinder_amr_pred,
parse_amrfinder_vir_pred,
parse_emmtyper_pred,
parse_mykrobe_amr_pred,
parse_resfinder_amr_pred,
parse_shigapass_pred,
Expand Down
1 change: 1 addition & 0 deletions prp/parse/phenotype/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Module for parsing resistance prediction results."""

from .amrfinder import parse_amrfinder_amr_pred, parse_amrfinder_vir_pred
from .emmtyper import parse_emmtyper_pred
from .mykrobe import parse_mykrobe_amr_pred
from .resfinder import parse_resfinder_amr_pred
from .shigapass import parse_shigapass_pred
Expand Down
41 changes: 41 additions & 0 deletions prp/parse/phenotype/emmtyper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""Functions for parsing emmtyper result."""

import logging
import pandas as pd

from typing import Any

from ...models.typing import EmmTypingMethodIndex, TypingMethod, TypingResultEmm
from ...models.typing import TypingSoftware as Software

LOG = logging.getLogger(__name__)

def parse_emmtyper_pred(path: str) -> list[EmmTypingMethodIndex]:
    """Parse emmtyper's output re emm-typing.

    The file at ``path`` is read as a tab-separated table without a header
    row. Its five columns are interpreted, in order, as sample name, cluster
    count, emm-type, emm-like alleles and emm cluster.

    Returns a list with one ``EmmTypingMethodIndex`` per row of the file.
    """
    LOG.info("Parsing emmtyper results")
    df = pd.read_csv(path, sep="\t", header=None)
    # Assigning the labels after reading raises if the file does not have
    # exactly five columns, so malformed input fails loudly instead of being
    # silently misaligned.
    df.columns = [
        "sample_name",
        "cluster_count",
        "emmtype",
        "emm_like_alleles",
        "emm_cluster",
    ]
    return [
        EmmTypingMethodIndex(
            type=TypingMethod.EMMTYPE,
            result=_parse_emmtyper_results(row),
            software=Software.EMMTYPER,
        )
        for row in df.to_dict(orient="records")
    ]


def _parse_emmtyper_results(info: dict[str, Any]) -> TypingResultEmm:
    """Parse emm gene prediction results for a single row.

    ``info`` maps the column names ``cluster_count``, ``emmtype``,
    ``emm_like_alleles`` and ``emm_cluster`` to the values of one row. The
    emm-like alleles arrive as one ``;``-separated string and are split into
    a list; a row without emm-like alleles yields an empty list.
    """
    raw_alleles = info["emm_like_alleles"]
    # The caller reads the file with pandas, which represents an empty field
    # as NaN (a float). Calling ``.split`` on that would raise
    # AttributeError, so anything that is not a string means "no alleles".
    emm_like_alleles = raw_alleles.split(";") if isinstance(raw_alleles, str) else []
    return TypingResultEmm(
        cluster_count=info["cluster_count"],
        emmtype=info["emmtype"],
        emm_like_alleles=emm_like_alleles,
        emm_cluster=info["emm_cluster"],
    )
6 changes: 3 additions & 3 deletions prp/parse/qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,13 +255,13 @@ def parse_quast_results(tsv_fpath: str) -> QcMethodIndex:
raw = [dict(zip(header, row)) for row in creader]
qc_res = QuastQcResult(
total_length=int(raw[0]["Total length"]),
reference_length=raw[0]["Reference length"],
reference_length=raw[0].get("Reference length", None),
largest_contig=raw[0]["Largest contig"],
n_contigs=raw[0]["# contigs"],
n50=raw[0]["N50"],
assembly_gc=raw[0]["GC (%)"],
reference_gc=raw[0]["Reference GC (%)"],
duplication_ratio=raw[0]["Duplication ratio"],
reference_gc=raw[0].get("Reference GC (%)", None),
duplication_ratio=raw[0].get("Duplication ratio", None),
)
return QcMethodIndex(software=QcSoftware.QUAST, result=qc_res)

Expand Down
4 changes: 3 additions & 1 deletion prp/parse/typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ def parse_mykrobe_lineage_results(pred_res: dict) -> MethodIndex | None:

def parse_virulencefinder_stx_typing(path: str) -> MethodIndex | None:
"""Parse virulencefinder's output re stx typing"""
LOG.info("Parsing virulencefinder stx results")
with open(path, "rb") as inpt:
pred_obj = json.load(inpt)
# if has valid results
Expand Down Expand Up @@ -230,7 +231,8 @@ def parse_virulencefinder_stx_typing(path: str) -> MethodIndex | None:


def parse_serotypefinder_oh_typing(path: str) -> MethodIndex | None:
"""Parse serotypefinder's output re OH typing"""
"""Parse serotypefinder's output re OH typing"""
LOG.info("Parsing serotypefinder oh type results")
with open(path, "rb") as inpt:
pred_obj = json.load(inpt)
# if has valid results
Expand Down
1 change: 1 addition & 0 deletions tests/fixtures/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
from .mtuberculosis import *
from .saureus import *
from .shigella import *
from .streptococcus import *
10 changes: 10 additions & 0 deletions tests/fixtures/streptococcus/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"""Fixtures for Streptococcus."""
import pytest

from ..fixtures import data_path


@pytest.fixture()
def streptococcus_emmtyper_path(data_path):
    """Return the location of the streptococcus emmtyper result file as a string."""
    emmtyper_file = data_path.joinpath("streptococcus", "emmtyper.tsv")
    return str(emmtyper_file)
1 change: 1 addition & 0 deletions tests/fixtures/streptococcus/emmtyper.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
test1_240920_nb000000_0000_test 2 EMM169.3 EMM164.2~* E4
22 changes: 22 additions & 0 deletions tests/parse/test_emmtyper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"""Test functions for parsing Emmtyper results."""

import pytest

from prp.parse.phenotype.emmtyper import parse_emmtyper_pred


def test_parse_emmtyper_results(streptococcus_emmtyper_path):
    """Check that an emmtyper result file is parsed into the expected values."""

    # parse the streptococcus result file
    parsed = parse_emmtyper_pred(streptococcus_emmtyper_path)

    # values expected for the first parsed entry
    expected_streptococcus = dict(
        cluster_count=2,
        emmtype="EMM169.3",
        emm_like_alleles=["EMM164.2~*"],
        emm_cluster="E4",
    )

    # the dumped result model must match field by field
    first_entry = parsed[0]
    assert expected_streptococcus == first_entry.result.model_dump()
Loading