Skip to content

Commit

Permalink
Revert shigapass changes and create specific MethodIndex for emmtyper
Browse files Browse the repository at this point in the history
  • Loading branch information
ryanjameskennedy committed Oct 8, 2024
1 parent 139e220 commit 7e726c9
Show file tree
Hide file tree
Showing 8 changed files with 109 additions and 97 deletions.
31 changes: 4 additions & 27 deletions prp/models/phenotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,15 +129,15 @@ class GeneBase(BaseModel):
sequence_name: Optional[str] = Field(
default=None, description="Reference sequence name"
)
element_type: Optional[ElementType] = Field(
default=None, description="The predominant function of the gene."
element_type: ElementType = Field(
description="The predominant function of the gene."
)
element_subtype: Optional[Union[
element_subtype: Union[
ElementStressSubtype,
ElementAmrSubtype,
ElementVirulenceSubtype,
ElementSerotypeSubtype,
]] = Field(default=None, description="Further functional categorization of the genes.")
] = Field(description="Further functional categorization of the genes.")
# position
ref_start_pos: Optional[int] = Field(
None, description="Alignment start in reference"
Expand Down Expand Up @@ -190,29 +190,6 @@ class SerotypeGene(GeneBase):
"""Container for serotype gene information"""


class EmmtypeGene(GeneBase):
    """Container for emmtype gene information.

    Extends GeneBase with the emm-typing fields; each of them defaults to
    None when not supplied.
    """

    cluster_count: Optional[int] = None
    emmtype: Optional[str] = None
    # Declared Optional because the default is None; a bare ``list[str]``
    # annotation contradicts that default and rejects an explicit None.
    emm_like_alleles: Optional[list[str]] = None
    emm_cluster: Optional[str] = None


class ShigatypeGene(GeneBase):
    """Container for shigatype gene information.

    Extends GeneBase with the shigapass-specific fields. Every field is
    optional and defaults to None.
    """

    # NOTE(review): the field names follow the lower-cased column names used
    # by the shigapass parser; confirm their meaning against the tool output.
    rfb: Optional[str] = None
    # numeric value; the shigapass parser fills it via _extract_percentage
    rfb_hits: Optional[float] = None
    mlst: Optional[str] = None
    flic: Optional[str] = None
    crispr: Optional[str] = None
    ipah: Optional[str] = None
    predicted_serotype: Optional[str] = None
    predicted_flex_serotype: Optional[str] = None
    comments: Optional[str] = None


class VirulenceGene(GeneBase, DatabaseReference):
"""Container for virulence gene information"""

Expand Down
4 changes: 3 additions & 1 deletion prp/models/sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
from .species import SppMethodIndex
from .typing import (
ResultLineageBase,
ShigaTypingMethodIndex,
EmmTypingMethodIndex,
TbProfilerLineage,
TypingMethod,
TypingResultCgMlst,
Expand Down Expand Up @@ -79,7 +81,7 @@ class PipelineResult(SampleBase):

schema_version: Literal[1] = 1
# optional typing
typing_result: list[MethodIndex] = Field(
typing_result: list[Union[ShigaTypingMethodIndex, EmmTypingMethodIndex, MethodIndex]] = Field(
..., alias="typingResult"
)
# optional phenotype prediction
Expand Down
43 changes: 41 additions & 2 deletions prp/models/typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pydantic import Field

from .base import RWModel
from .phenotype import SerotypeGene, VirulenceGene, ShigatypeGene, EmmtypeGene
from .phenotype import SerotypeGene, VirulenceGene


class TypingSoftware(str, Enum):
Expand Down Expand Up @@ -77,6 +77,45 @@ class TypingResultCgMlst(ResultMlstBase):
n_missing: int = Field(0, alias="nNovel")


class TypingResultShiga(RWModel):
    """Container for shigapass typing results.

    Used as the ``result`` payload of ShigaTypingMethodIndex. Every field is
    optional and defaults to None.
    """

    # NOTE(review): the field names follow the lower-cased column names used
    # by the shigapass parser; confirm their meaning against the tool output.
    rfb: Optional[str] = None
    # numeric value; the shigapass parser fills it via _extract_percentage
    rfb_hits: Optional[float] = None
    mlst: Optional[str] = None
    flic: Optional[str] = None
    crispr: Optional[str] = None
    ipah: Optional[str] = None
    predicted_serotype: Optional[str] = None
    predicted_flex_serotype: Optional[str] = None
    comments: Optional[str] = None


class ShigaTypingMethodIndex(RWModel):
    """Method index entry for shigapass typing results.

    The Literal annotations pin ``type`` to TypingMethod.SHIGATYPE and
    ``software`` to TypingSoftware.SHIGAPASS.
    """

    type: Literal[TypingMethod.SHIGATYPE]
    software: Literal[TypingSoftware.SHIGAPASS]
    # payload with the shigapass-specific result fields
    result: TypingResultShiga


class TypingResultEmm(RWModel):
    """Container for emmtyper typing results.

    Used as the ``result`` payload of EmmTypingMethodIndex. Each field
    defaults to None when not supplied.
    """

    cluster_count: Optional[int] = None
    emmtype: Optional[str] = None
    # Declared Optional because the default is None; a bare ``list[str]``
    # annotation contradicts that default and rejects an explicit None.
    emm_like_alleles: Optional[list[str]] = None
    emm_cluster: Optional[str] = None


class EmmTypingMethodIndex(RWModel):
    """Method index entry for emmtyper typing results.

    The Literal annotations pin ``type`` to TypingMethod.EMMTYPE and
    ``software`` to TypingSoftware.EMMTYPER.
    """

    type: Literal[TypingMethod.EMMTYPE]
    software: Literal[TypingSoftware.EMMTYPER]
    # payload with the emmtyper-specific result fields
    result: TypingResultEmm


class ResultLineageBase(RWModel):
"""Lineage results"""

Expand All @@ -101,7 +140,7 @@ class TbProfilerLineage(ResultLineageBase):
lineages: list[LineageInformation]


class TypingResultGeneAllele(VirulenceGene, SerotypeGene, ShigatypeGene, EmmtypeGene):
class TypingResultGeneAllele(VirulenceGene, SerotypeGene):
"""Identification of individual gene alleles."""


Expand Down
4 changes: 2 additions & 2 deletions prp/parse/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,17 @@
from .phenotype import (
parse_amrfinder_amr_pred,
parse_amrfinder_vir_pred,
parse_emmtyper_pred,
parse_mykrobe_amr_pred,
parse_resfinder_amr_pred,
parse_shigapass_pred,
parse_tbprofiler_amr_pred,
parse_virulencefinder_vir_pred,
)
from .qc import parse_alignment_results, parse_postalignqc_results, parse_quast_results
from .species import parse_kraken_result
from .typing import (
parse_cgmlst_results,
parse_emmtyper_pred,
parse_shigapass_pred,
parse_mlst_results,
parse_mykrobe_lineage_results,
parse_serotypefinder_oh_typing,
Expand Down
2 changes: 2 additions & 0 deletions prp/parse/phenotype/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"""Module for parsing resistance prediction results."""

from .amrfinder import parse_amrfinder_amr_pred, parse_amrfinder_vir_pred
from .emmtyper import parse_emmtyper_pred
from .mykrobe import parse_mykrobe_amr_pred
from .resfinder import parse_resfinder_amr_pred
from .shigapass import parse_shigapass_pred
from .tbprofiler import parse_tbprofiler_amr_pred
from .virulencefinder import parse_virulencefinder_vir_pred
35 changes: 24 additions & 11 deletions prp/parse/phenotype/emmtyper.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,41 @@
"""Functions for parsing emmtyper result."""

import logging
import pandas as pd

from typing import Any

from ...models.phenotype import ElementType, ElementVirulenceSubtype
from ...models.phenotype import (EmmtypeGene)
from ...models.typing import EmmTypingMethodIndex, TypingMethod, TypingResultEmm
from ...models.typing import TypingSoftware as Software

LOG = logging.getLogger(__name__)

def parse_emmtyper_pred(path: str) -> list[EmmTypingMethodIndex]:
    """Parse emmtyper's output re emm-typing.

    The file is read as header-less, tab-separated text and its columns are
    named sample_name, cluster_count, emmtype, emm_like_alleles and
    emm_cluster, in that order.

    :param path: Path to the emmtyper result file.
    :return: One EmmTypingMethodIndex per row of the file, in file order.
    """
    LOG.info("Parsing emmtyper results")
    df = pd.read_csv(path, sep="\t", header=None)
    df.columns = [
        "sample_name",
        "cluster_count",
        "emmtype",
        "emm_like_alleles",
        "emm_cluster",
    ]
    # every row becomes its own method-index entry
    return [
        EmmTypingMethodIndex(
            type=TypingMethod.EMMTYPE,
            result=_parse_emmtyper_results(row),
            software=Software.EMMTYPER,
        )
        for row in df.to_dict(orient="records")
    ]

def _split_emm_like_alleles(value: Any) -> list[str]:
    """Split the ';'-separated emm-like allele field into a list.

    pandas yields NaN (a float) for an empty column, which has no ``split``
    method; any non-string value is therefore reported as "no alleles".
    """
    return value.split(";") if isinstance(value, str) else []


def parse_emm_gene(
    info: dict[str, Any], subtype: ElementVirulenceSubtype = ElementVirulenceSubtype.VIR
) -> EmmtypeGene:
    """Parse emm gene prediction results.

    :param info: One emmtyper result row keyed by column name.
    :param subtype: Element subtype stored on the returned gene.
    :return: EmmtypeGene classified as a virulence element.
    """
    return EmmtypeGene(
        # info
        cluster_count=info["cluster_count"],
        emmtype=info["emmtype"],
        emm_like_alleles=_split_emm_like_alleles(info["emm_like_alleles"]),
        emm_cluster=info["emm_cluster"],
        # gene classification
        element_type=ElementType.VIR,
        element_subtype=subtype,
    )


def _parse_emmtyper_results(info: dict[str, Any]) -> TypingResultEmm:
    """Parse emm gene prediction results.

    :param info: One emmtyper result row keyed by column name.
    :return: TypingResultEmm holding the values of that row.
    """
    return TypingResultEmm(
        # info
        cluster_count=info["cluster_count"],
        emmtype=info["emmtype"],
        emm_like_alleles=_split_emm_like_alleles(info["emm_like_alleles"]),
        emm_cluster=info["emm_cluster"],
    )
36 changes: 33 additions & 3 deletions prp/parse/phenotype/shigapass.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,43 @@
import logging
import re

import numpy as np
import pandas as pd

from ...models.phenotype import (ShigatypeGene)
from ...models.typing import ShigaTypingMethodIndex, TypingMethod, TypingResultShiga
from ...models.typing import TypingSoftware as Software

LOG = logging.getLogger(__name__)


def parse_shigapass_pred(path: str) -> ShigaTypingMethodIndex:
    """Read a shigapass result file and wrap its first row as a typing entry."""
    LOG.info("Parsing shigapass prediction")
    # map the tool's column headers onto the model's field names
    column_names = {
        "Name": "sample_name",
        "rfb_hits,(%)": "rfb_hits",
        "MLST": "mlst",
        "fliC": "flic",
        "CRISPR": "crispr",
        "ipaH": "ipah",
        "Predicted_Serotype": "predicted_serotype",
        "Predicted_FlexSerotype": "predicted_flex_serotype",
        "Comments": "comments",
    }
    # load the ';'-delimited table; "ND" and "none" count as missing values
    raw_table = pd.read_csv(path, delimiter=";", na_values=["ND", "none"])
    renamed_table = raw_table.rename(columns=column_names)
    predictions = renamed_table.replace(np.nan, None)
    # only the row labelled 0 is converted
    first_row_result = _parse_shigapass_results(predictions, 0)
    return ShigaTypingMethodIndex(
        type=TypingMethod.SHIGATYPE,
        result=first_row_result,
        software=Software.SHIGAPASS,
    )


def _extract_percentage(rfb_hits: str) -> float:
pattern = r"([0-9\.]+)%"
match = re.search(pattern, rfb_hits)
Expand All @@ -20,8 +50,8 @@ def _extract_percentage(rfb_hits: str) -> float:
return percentile_value


def parse_shiga_gene(predictions: pd.DataFrame, row: int) -> ShigatypeGene:
return ShigatypeGene(
def _parse_shigapass_results(predictions: pd.DataFrame, row: int) -> TypingResultShiga:
return TypingResultShiga(
rfb=predictions.loc[row, "rfb"],
rfb_hits=_extract_percentage(str(predictions.loc[row, "rfb_hits"])),
mlst=predictions.loc[row, "mlst"],
Expand Down
51 changes: 0 additions & 51 deletions prp/parse/typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@
from ..models.typing import TypingSoftware as Software
from .phenotype.serotypefinder import parse_serotype_gene
from .phenotype.virulencefinder import parse_vir_gene
from .phenotype.emmtyper import parse_emm_gene
from .phenotype.shigapass import parse_shiga_gene

LOG = logging.getLogger(__name__)

Expand Down Expand Up @@ -260,52 +258,3 @@ def parse_serotypefinder_oh_typing(path: str) -> MethodIndex | None:
)
)
return pred_result


def parse_emmtyper_pred(path: str) -> list[MethodIndex]:
    """Parse emmtyper's output re emm-typing.

    The file is read as header-less, tab-separated text and its columns are
    named sample_name, cluster_count, emmtype, emm_like_alleles and
    emm_cluster, in that order.

    :param path: Path to the emmtyper result file.
    :return: One MethodIndex per row of the file, in file order.
    """
    LOG.info("Parsing emmtyper results")
    df = pd.read_csv(path, sep="\t", header=None)
    df.columns = [
        "sample_name",
        "cluster_count",
        "emmtype",
        "emm_like_alleles",
        "emm_cluster",
    ]
    # each row is parsed to a gene, then re-wrapped as a gene-allele result
    return [
        MethodIndex(
            type=TypingMethod.EMMTYPE,
            software=Software.EMMTYPER,
            result=TypingResultGeneAllele(**parse_emm_gene(row).model_dump()),
        )
        for row in df.to_dict(orient="records")
    ]


def parse_shigapass_pred(path: str) -> MethodIndex:
    """Read a shigapass result file and wrap its first row as a method index."""
    LOG.info("Parsing shigapass prediction")
    # map the tool's column headers onto the model's field names
    column_names = {
        "Name": "sample_name",
        "rfb_hits,(%)": "rfb_hits",
        "MLST": "mlst",
        "fliC": "flic",
        "CRISPR": "crispr",
        "ipaH": "ipah",
        "Predicted_Serotype": "predicted_serotype",
        "Predicted_FlexSerotype": "predicted_flex_serotype",
        "Comments": "comments",
    }
    # load the ';'-delimited table; "ND" and "none" count as missing values
    raw_table = pd.read_csv(path, delimiter=";", na_values=["ND", "none"])
    renamed_table = raw_table.rename(columns=column_names)
    predictions = renamed_table.replace(np.nan, None)
    # only the row labelled 0 is converted, then re-wrapped as a gene allele
    first_row_gene = parse_shiga_gene(predictions, 0)
    allele = TypingResultGeneAllele(**first_row_gene.model_dump())
    return MethodIndex(
        type=TypingMethod.SHIGATYPE,
        result=allele,
        software=Software.SHIGAPASS,
    )

0 comments on commit 7e726c9

Please sign in to comment.