From 5f37afdef254261b51a313c876a13b7c1c677f2a Mon Sep 17 00:00:00 2001 From: ryanjameskennedy Date: Mon, 7 Oct 2024 16:09:53 +0200 Subject: [PATCH 1/9] Add emmtyper to prp/cli.py --- prp/cli.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/prp/cli.py b/prp/cli.py index c9abf62..83af60e 100644 --- a/prp/cli.py +++ b/prp/cli.py @@ -23,6 +23,7 @@ parse_amrfinder_amr_pred, parse_amrfinder_vir_pred, parse_cgmlst_results, + parse_emmtyper_pred, parse_kraken_result, parse_mlst_results, parse_mykrobe_amr_pred, @@ -116,6 +117,7 @@ def cli(silent, debug): ) @click.option("-p", "--quality", type=click.Path(), help="postalignqc qc results") @click.option("-k", "--mykrobe", type=click.Path(), help="mykrobe results") +@click.option("-e", "--emmtyper", type=click.Path(), help="Emmtyper m-type prediction results") @click.option("-g", "--shigapass", type=click.Path(), help="shigapass results") @click.option("-t", "--tbprofiler", type=click.Path(), help="tbprofiler results") @click.option("--bam", type=click.Path(), help="Read mapping to reference genome") @@ -153,6 +155,7 @@ def create_bonsai_input( serotypefinder, quality, mykrobe, + emmtyper, shigapass, tbprofiler, bam, @@ -246,6 +249,13 @@ def create_bonsai_input( if res is not None: results["typing_result"].extend(res) + if emmtyper: + LOG.info("Parse emmtyper results") + # Emmtyping + res: MethodIndex | None = parse_emmtyper_pred(emmtyper) + if res is not None: + results["typing_result"].extend(res) + if shigapass: LOG.info("Parse shigapass results") # Shigatyping From 53c67ab0c40f6c5c4d6f2612662f34ced2d7bd4c Mon Sep 17 00:00:00 2001 From: ryanjameskennedy Date: Mon, 7 Oct 2024 16:11:34 +0200 Subject: [PATCH 2/9] Make ref genome rel variables optional in quast --- prp/models/qc.py | 6 +++--- prp/parse/qc.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/prp/models/qc.py b/prp/models/qc.py index 4095f24..cfa16bd 100644 --- a/prp/models/qc.py +++ b/prp/models/qc.py @@ -20,13 +20,13 @@ class QuastQcResult(BaseModel): """Assembly QC metrics.""" total_length: int - reference_length: int + reference_length: int | None = None largest_contig: int n_contigs: int n50: int assembly_gc: float - reference_gc: float - duplication_ratio: float + reference_gc: float | None = None + duplication_ratio: float | None = None class PostAlignQcResult(BaseModel): diff --git a/prp/parse/qc.py b/prp/parse/qc.py index 3ee87e5..aa93e21 100644 --- a/prp/parse/qc.py +++ b/prp/parse/qc.py @@ -255,13 +255,13 @@ def parse_quast_results(tsv_fpath: str) -> QcMethodIndex: raw = [dict(zip(header, row)) for row in creader] qc_res = QuastQcResult( total_length=int(raw[0]["Total length"]), - reference_length=raw[0]["Reference length"], + reference_length=raw[0].get("Reference length", None), largest_contig=raw[0]["Largest contig"], n_contigs=raw[0]["# contigs"], n50=raw[0]["N50"], assembly_gc=raw[0]["GC (%)"], - reference_gc=raw[0]["Reference GC (%)"], - duplication_ratio=raw[0]["Duplication ratio"], + reference_gc=raw[0].get("Reference GC (%)", None), + duplication_ratio=raw[0].get("Duplication ratio", None), ) return QcMethodIndex(software=QcSoftware.QUAST, result=qc_res) From 03180735b1269024e3d21e48b5fb78a239544d36 Mon Sep 17 00:00:00 2001 From: ryanjameskennedy Date: Mon, 7 Oct 2024 16:15:38 +0200 Subject: [PATCH 3/9] Add emmtyper parser and model and move shigapass --- prp/models/phenotype.py | 31 ++++++++++++++--- prp/models/sample.py | 3 +- prp/models/typing.py | 28 +++------------- prp/parse/__init__.py | 3 +- prp/parse/phenotype/__init__.py | 1 - prp/parse/phenotype/emmtyper.py | 28 ++++++++++++++++ prp/parse/phenotype/shigapass.py | 36 ++------------------ prp/parse/typing.py | 57 +++++++++++++++++++++++++++++++- 8 files changed, 121 insertions(+), 66 deletions(-) create mode 100644 prp/parse/phenotype/emmtyper.py diff --git a/prp/models/phenotype.py b/prp/models/phenotype.py index 2496d87..3e440e4 100644 --- a/prp/models/phenotype.py +++ b/prp/models/phenotype.py @@ -129,15 +129,15 @@ class GeneBase(BaseModel): sequence_name: Optional[str] = Field( default=None, description="Reference sequence name" ) - element_type: ElementType = Field( - description="The predominant function fo the gene." + element_type: Optional[ElementType] = Field( + default=None, description="The predominant function of the gene." ) - element_subtype: Union[ + element_subtype: Optional[Union[ ElementStressSubtype, ElementAmrSubtype, ElementVirulenceSubtype, ElementSerotypeSubtype, - ] = Field(description="Further functional categorization of the genes.") + ]] = Field(default=None, description="Further functional categorization of the genes.") # position ref_start_pos: Optional[int] = Field( None, description="Alignment start in reference" @@ -190,6 +190,29 @@ class SerotypeGene(GeneBase): """Container for serotype gene information""" +class EmmtypeGene(GeneBase): + """Container for emmtype gene information""" + + cluster_count: Optional[int] = None + emmtype: Optional[str] = None + emm_like_alleles: list[str] = None + emm_cluster: Optional[str] = None + + +class ShigatypeGene(GeneBase): + """Container for shigatype gene information""" + + rfb: Optional[str] = None + rfb_hits: Optional[float] = None + mlst: Optional[str] = None + flic: Optional[str] = None + crispr: Optional[str] = None + ipah: Optional[str] = None + predicted_serotype: Optional[str] = None + predicted_flex_serotype: Optional[str] = None + comments: Optional[str] = None + + class VirulenceGene(GeneBase, DatabaseReference): """Container for virulence gene information""" diff --git a/prp/models/sample.py b/prp/models/sample.py index ecdd8e2..80ae9dd 100644 --- a/prp/models/sample.py +++ b/prp/models/sample.py @@ -16,7 +16,6 @@ from .species import SppMethodIndex from .typing import ( ResultLineageBase, - ShigaTypingMethodIndex, TbProfilerLineage, TypingMethod, TypingResultCgMlst, @@ -80,7 +79,7 @@ class PipelineResult(SampleBase): schema_version: Literal[1] = 1 # optional typing - typing_result: list[Union[ShigaTypingMethodIndex, MethodIndex]] = Field( + typing_result: list[MethodIndex] = Field( ..., alias="typingResult" ) # optional phenotype prediction diff --git a/prp/models/typing.py b/prp/models/typing.py index 20795ca..0630f40 100644 --- a/prp/models/typing.py +++ b/prp/models/typing.py @@ -6,7 +6,7 @@ from pydantic import Field from .base import RWModel -from .phenotype import SerotypeGene, VirulenceGene +from .phenotype import SerotypeGene, VirulenceGene, ShigatypeGene, EmmtypeGene class TypingSoftware(str, Enum): @@ -19,6 +19,7 @@ class TypingSoftware(str, Enum): VIRULENCEFINDER = "virulencefinder" SEROTYPEFINDER = "serotypefinder" SHIGAPASS = "shigapass" + EMMTYPER = "emmtyper" class TypingMethod(str, Enum): @@ -31,6 +32,7 @@ class TypingMethod(str, Enum): OTYPE = "O_type" HTYPE = "H_type" SHIGATYPE = "shigatype" + EMMTYPE = "emmtype" class ChewbbacaErrors(str, Enum): @@ -75,28 +77,6 @@ class TypingResultCgMlst(ResultMlstBase): n_missing: int = Field(0, alias="nNovel") -class TypingResultShiga(RWModel): - """Container for shigatype gene information""" - - rfb: Optional[str] = None - rfb_hits: Optional[float] = None - mlst: Optional[str] = None - flic: Optional[str] = None - crispr: Optional[str] = None - ipah: Optional[str] = None - predicted_serotype: Optional[str] = None - predicted_flex_serotype: Optional[str] = None - comments: Optional[str] = None - - -class ShigaTypingMethodIndex(RWModel): - """Method Index Shiga.""" - - type: Literal[TypingMethod.SHIGATYPE] - software: Literal[TypingSoftware.SHIGAPASS] - result: TypingResultShiga - - class ResultLineageBase(RWModel): """Lineage results""" @@ -121,7 +101,7 @@ class TbProfilerLineage(ResultLineageBase): lineages: list[LineageInformation] -class TypingResultGeneAllele(VirulenceGene, SerotypeGene): +class TypingResultGeneAllele(VirulenceGene, SerotypeGene, ShigatypeGene, EmmtypeGene): """Identification of individual gene alleles.""" diff --git a/prp/parse/__init__.py b/prp/parse/__init__.py index 5879422..483ed1f 100644 --- a/prp/parse/__init__.py +++ b/prp/parse/__init__.py @@ -5,7 +5,6 @@ parse_amrfinder_vir_pred, parse_mykrobe_amr_pred, parse_resfinder_amr_pred, - parse_shigapass_pred, parse_tbprofiler_amr_pred, parse_virulencefinder_vir_pred, ) @@ -13,6 +12,8 @@ from .species import parse_kraken_result from .typing import ( parse_cgmlst_results, + parse_emmtyper_pred, + parse_shigapass_pred, parse_mlst_results, parse_mykrobe_lineage_results, parse_serotypefinder_oh_typing, diff --git a/prp/parse/phenotype/__init__.py b/prp/parse/phenotype/__init__.py index 88f9950..5c83ad2 100644 --- a/prp/parse/phenotype/__init__.py +++ b/prp/parse/phenotype/__init__.py @@ -3,6 +3,5 @@ from .amrfinder import parse_amrfinder_amr_pred, parse_amrfinder_vir_pred from .mykrobe import parse_mykrobe_amr_pred from .resfinder import parse_resfinder_amr_pred -from .shigapass import parse_shigapass_pred from .tbprofiler import parse_tbprofiler_amr_pred from .virulencefinder import parse_virulencefinder_vir_pred diff --git a/prp/parse/phenotype/emmtyper.py b/prp/parse/phenotype/emmtyper.py new file mode 100644 index 0000000..7c4d031 --- /dev/null +++ b/prp/parse/phenotype/emmtyper.py @@ -0,0 +1,28 @@ +"""Functions for parsing emmtyper result.""" + +import logging + +from typing import Any + +from ...models.phenotype import ElementType, ElementVirulenceSubtype +from ...models.phenotype import (EmmtypeGene) + +LOG = logging.getLogger(__name__) + + +def parse_emm_gene( + info: dict[str, Any], subtype: ElementVirulenceSubtype = ElementVirulenceSubtype.VIR +) -> EmmtypeGene: + + emm_like_alleles = info["emm_like_alleles"].split(";") + """Parse emm gene prediction results.""" + return EmmtypeGene( + # info + cluster_count=info["cluster_count"], + emmtype=info["emmtype"], + emm_like_alleles=emm_like_alleles, + emm_cluster=info["emm_cluster"], + # gene classification + element_type=ElementType.VIR, + element_subtype=subtype, + ) diff --git a/prp/parse/phenotype/shigapass.py b/prp/parse/phenotype/shigapass.py index 4357144..7f570ad 100644 --- a/prp/parse/phenotype/shigapass.py +++ b/prp/parse/phenotype/shigapass.py @@ -3,43 +3,13 @@ import logging import re -import numpy as np import pandas as pd -from ...models.typing import ShigaTypingMethodIndex, TypingMethod, TypingResultShiga -from ...models.typing import TypingSoftware as Software +from ...models.phenotype import (ShigatypeGene) LOG = logging.getLogger(__name__) -def parse_shigapass_pred(path: str) -> ShigaTypingMethodIndex: - """Parse shigapass prediction results.""" - LOG.info("Parsing shigapass prediction") - cols = { - "Name": "sample_name", - "rfb_hits,(%)": "rfb_hits", - "MLST": "mlst", - "fliC": "flic", - "CRISPR": "crispr", - "ipaH": "ipah", - "Predicted_Serotype": "predicted_serotype", - "Predicted_FlexSerotype": "predicted_flex_serotype", - "Comments": "comments", - } - # read as dataframe and process data structure - hits = ( - pd.read_csv(path, delimiter=";", na_values=["ND", "none"]) - .rename(columns=cols) - .replace(np.nan, None) - ) - shigatype_results = _parse_shigapass_results(hits, 0) - return ShigaTypingMethodIndex( - type=TypingMethod.SHIGATYPE, - result=shigatype_results, - software=Software.SHIGAPASS, - ) - - def _extract_percentage(rfb_hits: str) -> float: pattern = r"([0-9\.]+)%" match = re.search(pattern, rfb_hits) @@ -50,8 +20,8 @@ def _extract_percentage(rfb_hits: str) -> float: return percentile_value -def _parse_shigapass_results(predictions: pd.DataFrame, row: int) -> TypingResultShiga: - return TypingResultShiga( +def parse_shiga_gene(predictions: pd.DataFrame, row: int) -> ShigatypeGene: + return ShigatypeGene( rfb=predictions.loc[row, "rfb"], rfb_hits=_extract_percentage(str(predictions.loc[row, "rfb_hits"])), mlst=predictions.loc[row, "mlst"], diff --git a/prp/parse/typing.py b/prp/parse/typing.py index 0fe1d09..9227e5c 100644 --- a/prp/parse/typing.py +++ b/prp/parse/typing.py @@ -3,6 +3,8 @@ import csv import json import logging +import pandas as pd +import numpy as np from ..models.sample import MethodIndex from ..models.typing import ( @@ -18,6 +20,8 @@ from ..models.typing import TypingSoftware as Software from .phenotype.serotypefinder import parse_serotype_gene from .phenotype.virulencefinder import parse_vir_gene +from .phenotype.emmtyper import parse_emm_gene +from .phenotype.shigapass import parse_shiga_gene LOG = logging.getLogger(__name__) @@ -201,6 +205,7 @@ def parse_mykrobe_lineage_results(pred_res: dict) -> MethodIndex | None: def parse_virulencefinder_stx_typing(path: str) -> MethodIndex | None: """Parse virulencefinder's output re stx typing""" + LOG.info("Parsing virulencefinder stx results") with open(path, "rb") as inpt: pred_obj = json.load(inpt) # if has valid results @@ -230,7 +235,8 @@ def parse_virulencefinder_stx_typing(path: str) -> MethodIndex | None: def parse_serotypefinder_oh_typing(path: str) -> MethodIndex | None: - """Parse serotypefinder's output re OH typing""" + """Parse 's output re OH typing""" + LOG.info("Parsing serotypefinder oh type results") with open(path, "rb") as inpt: pred_obj = json.load(inpt) # if has valid results @@ -254,3 +260,52 @@ def parse_serotypefinder_oh_typing(path: str) -> MethodIndex | None: ) ) return pred_result + + +def parse_emmtyper_pred(path: str) -> MethodIndex | None: + """Parse emmtyper's output re emm-typing""" + LOG.info("Parsing emmtyper results") + pred_result = [] + df = pd.read_csv(path, sep='\t', header=None) + df.columns = ["sample_name", "cluster_count", "emmtype", "emm_like_alleles", "emm_cluster"] + df_loa = df.to_dict(orient="records") + for emmtype_array in df_loa: + emm_gene = parse_emm_gene(emmtype_array) + gene = TypingResultGeneAllele(**emm_gene.model_dump()) + pred_result.append( + MethodIndex( + type=TypingMethod.EMMTYPE, + software=Software.EMMTYPER, + result=gene, + ) + ) + return pred_result + + +def parse_shigapass_pred(path: str) -> MethodIndex: + """Parse shigapass prediction results.""" + LOG.info("Parsing shigapass prediction") + cols = { + "Name": "sample_name", + "rfb_hits,(%)": "rfb_hits", + "MLST": "mlst", + "fliC": "flic", + "CRISPR": "crispr", + "ipaH": "ipah", + "Predicted_Serotype": "predicted_serotype", + "Predicted_FlexSerotype": "predicted_flex_serotype", + "Comments": "comments", + } + # read as dataframe and process data structure + hits = ( + pd.read_csv(path, delimiter=";", na_values=["ND", "none"]) + .rename(columns=cols) + .replace(np.nan, None) + ) + shigatype_gene = parse_shiga_gene(hits, 0) + gene = TypingResultGeneAllele(**shigatype_gene.model_dump()) + return MethodIndex( + type=TypingMethod.SHIGATYPE, + result=gene, + software=Software.SHIGAPASS, + ) From 139e2200e522c8abca8f06fe9949488203a07e71 Mon Sep 17 00:00:00 2001 From: ryanjameskennedy Date: Mon, 7 Oct 2024 16:19:48 +0200 Subject: [PATCH 4/9] Update CHANGELOG re emmtyper addition, and shigapass & quast changes --- CHANGELOG.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a5740cd..c05a59c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,17 +2,23 @@ ### Added + - Added emmtyper and parser + ### Fixed ### Changed + - Changed Shigapass models to be consistent with other typing models + - Changed Shigapass parsers to be consistent with other typing parsers + - Changed ref genome related variables to be optional in quast + ## [0.10.1] ### Added ### Fixed -- Updated parsing of ChewBBACA allele calling annotations and novel alleles. This adds support for annotations introduced in v3. + - Updated parsing of ChewBBACA allele calling annotations and novel alleles. This adds support for annotations introduced in v3. ### Changed From 7e726c93ff3b9716a5771c9c4b7eb47acce8c0ba Mon Sep 17 00:00:00 2001 From: ryanjameskennedy Date: Tue, 8 Oct 2024 16:48:37 +0200 Subject: [PATCH 5/9] Revert shigapass changes and create specific MedthodIndex for emmtyper --- prp/models/phenotype.py | 31 +++---------------- prp/models/sample.py | 4 ++- prp/models/typing.py | 43 +++++++++++++++++++++++++-- prp/parse/__init__.py | 4 +-- prp/parse/phenotype/__init__.py | 2 ++ prp/parse/phenotype/emmtyper.py | 35 +++++++++++++++------- prp/parse/phenotype/shigapass.py | 36 ++++++++++++++++++++-- prp/parse/typing.py | 51 -------------------------------- 8 files changed, 109 insertions(+), 97 deletions(-) diff --git a/prp/models/phenotype.py b/prp/models/phenotype.py index 3e440e4..40b37e6 100644 --- a/prp/models/phenotype.py +++ b/prp/models/phenotype.py @@ -129,15 +129,15 @@ class GeneBase(BaseModel): sequence_name: Optional[str] = Field( default=None, description="Reference sequence name" ) - element_type: Optional[ElementType] = Field( - default=None, description="The predominant function of the gene." + element_type: ElementType = Field( + description="The predominant function of the gene." ) - element_subtype: Optional[Union[ + element_subtype: Union[ ElementStressSubtype, ElementAmrSubtype, ElementVirulenceSubtype, ElementSerotypeSubtype, - ]] = Field(default=None, description="Further functional categorization of the genes.") + ] = Field(description="Further functional categorization of the genes.") # position ref_start_pos: Optional[int] = Field( None, description="Alignment start in reference" @@ -190,29 +190,6 @@ class SerotypeGene(GeneBase): """Container for serotype gene information""" -class EmmtypeGene(GeneBase): - """Container for emmtype gene information""" - - cluster_count: Optional[int] = None - emmtype: Optional[str] = None - emm_like_alleles: list[str] = None - emm_cluster: Optional[str] = None - - -class ShigatypeGene(GeneBase): - """Container for shigatype gene information""" - - rfb: Optional[str] = None - rfb_hits: Optional[float] = None - mlst: Optional[str] = None - flic: Optional[str] = None - crispr: Optional[str] = None - ipah: Optional[str] = None - predicted_serotype: Optional[str] = None - predicted_flex_serotype: Optional[str] = None - comments: Optional[str] = None - - class VirulenceGene(GeneBase, DatabaseReference): """Container for virulence gene information""" diff --git a/prp/models/sample.py b/prp/models/sample.py index 80ae9dd..ec66694 100644 --- a/prp/models/sample.py +++ b/prp/models/sample.py @@ -16,6 +16,8 @@ from .species import SppMethodIndex from .typing import ( ResultLineageBase, + ShigaTypingMethodIndex, + EmmTypingMethodIndex, TbProfilerLineage, TypingMethod, TypingResultCgMlst, @@ -79,7 +81,7 @@ class PipelineResult(SampleBase): schema_version: Literal[1] = 1 # optional typing - typing_result: list[MethodIndex] = Field( + typing_result: list[Union[ShigaTypingMethodIndex, EmmTypingMethodIndex, MethodIndex]] = Field( ..., alias="typingResult" ) # optional phenotype prediction diff --git a/prp/models/typing.py b/prp/models/typing.py index 0630f40..1e4743c 100644 --- a/prp/models/typing.py +++ b/prp/models/typing.py @@ -6,7 +6,7 @@ from pydantic import Field from .base import RWModel -from .phenotype import SerotypeGene, VirulenceGene, ShigatypeGene, EmmtypeGene +from .phenotype import SerotypeGene, VirulenceGene class TypingSoftware(str, Enum): @@ -77,6 +77,45 @@ class TypingResultCgMlst(ResultMlstBase): n_missing: int = Field(0, alias="nNovel") +class TypingResultShiga(RWModel): + """Container for shigatype gene information""" + + rfb: Optional[str] = None + rfb_hits: Optional[float] = None + mlst: Optional[str] = None + flic: Optional[str] = None + crispr: Optional[str] = None + ipah: Optional[str] = None + predicted_serotype: Optional[str] = None + predicted_flex_serotype: Optional[str] = None + comments: Optional[str] = None + + +class ShigaTypingMethodIndex(RWModel): + """Method Index Shiga.""" + + type: Literal[TypingMethod.SHIGATYPE] + software: Literal[TypingSoftware.SHIGAPASS] + result: TypingResultShiga + + +class TypingResultEmm(RWModel): + """Container for emmtype gene information""" + + cluster_count: Optional[int] = None + emmtype: Optional[str] = None + emm_like_alleles: list[str] = None + emm_cluster: Optional[str] = None + + +class EmmTypingMethodIndex(RWModel): + """Method Index Shiga.""" + + type: Literal[TypingMethod.EMMTYPE] + software: Literal[TypingSoftware.EMMTYPER] + result: TypingResultEmm + + class ResultLineageBase(RWModel): """Lineage results""" @@ -101,7 +140,7 @@ class TbProfilerLineage(ResultLineageBase): lineages: list[LineageInformation] -class TypingResultGeneAllele(VirulenceGene, SerotypeGene, ShigatypeGene, EmmtypeGene): +class TypingResultGeneAllele(VirulenceGene, SerotypeGene): """Identification of individual gene alleles.""" diff --git a/prp/parse/__init__.py b/prp/parse/__init__.py index 483ed1f..b7eb5ae 100644 --- a/prp/parse/__init__.py +++ b/prp/parse/__init__.py @@ -3,8 +3,10 @@ from .phenotype import ( parse_amrfinder_amr_pred, parse_amrfinder_vir_pred, + parse_emmtyper_pred, parse_mykrobe_amr_pred, parse_resfinder_amr_pred, + parse_shigapass_pred, parse_tbprofiler_amr_pred, parse_virulencefinder_vir_pred, ) @@ -12,8 +14,6 @@ from .species import parse_kraken_result from .typing import ( parse_cgmlst_results, - parse_emmtyper_pred, - parse_shigapass_pred, parse_mlst_results, parse_mykrobe_lineage_results, parse_serotypefinder_oh_typing, diff --git a/prp/parse/phenotype/__init__.py b/prp/parse/phenotype/__init__.py index 5c83ad2..f98a2b4 100644 --- a/prp/parse/phenotype/__init__.py +++ b/prp/parse/phenotype/__init__.py @@ -1,7 +1,9 @@ """Module for parsing resistance prediction results.""" from .amrfinder import parse_amrfinder_amr_pred, parse_amrfinder_vir_pred +from .emmtyper import parse_emmtyper_pred from .mykrobe import parse_mykrobe_amr_pred from .resfinder import parse_resfinder_amr_pred +from .shigapass import parse_shigapass_pred from .tbprofiler import parse_tbprofiler_amr_pred from .virulencefinder import parse_virulencefinder_vir_pred diff --git a/prp/parse/phenotype/emmtyper.py b/prp/parse/phenotype/emmtyper.py index 7c4d031..52b5f27 100644 --- a/prp/parse/phenotype/emmtyper.py +++ b/prp/parse/phenotype/emmtyper.py @@ -1,28 +1,41 @@ """Functions for parsing emmtyper result.""" import logging +import pandas as pd from typing import Any -from ...models.phenotype import ElementType, ElementVirulenceSubtype -from ...models.phenotype import (EmmtypeGene) +from ...models.typing import EmmTypingMethodIndex, TypingMethod, TypingResultEmm +from ...models.typing import TypingSoftware as Software LOG = logging.getLogger(__name__) +def parse_emmtyper_pred(path: str) -> EmmTypingMethodIndex | None: + """Parse emmtyper's output re emm-typing""" + LOG.info("Parsing emmtyper results") + pred_result = [] + df = pd.read_csv(path, sep='\t', header=None) + df.columns = ["sample_name", "cluster_count", "emmtype", "emm_like_alleles", "emm_cluster"] + df_loa = df.to_dict(orient="records") + for emmtype_array in df_loa: + emmtype_results = _parse_emmtyper_results(emmtype_array) + pred_result.append( + EmmTypingMethodIndex( + type=TypingMethod.EMMTYPE, + result=emmtype_results, + software=Software.EMMTYPER, + ) + ) + return pred_result -def parse_emm_gene( - info: dict[str, Any], subtype: ElementVirulenceSubtype = ElementVirulenceSubtype.VIR -) -> EmmtypeGene: - - emm_like_alleles = info["emm_like_alleles"].split(";") + +def _parse_emmtyper_results(info: dict[str, Any]) -> TypingResultEmm: """Parse emm gene prediction results.""" - return EmmtypeGene( + emm_like_alleles = info["emm_like_alleles"].split(";") + return TypingResultEmm( # info cluster_count=info["cluster_count"], emmtype=info["emmtype"], emm_like_alleles=emm_like_alleles, emm_cluster=info["emm_cluster"], - # gene classification - element_type=ElementType.VIR, - element_subtype=subtype, ) diff --git a/prp/parse/phenotype/shigapass.py b/prp/parse/phenotype/shigapass.py index 7f570ad..4357144 100644 --- a/prp/parse/phenotype/shigapass.py +++ b/prp/parse/phenotype/shigapass.py @@ -3,13 +3,43 @@ import logging import re +import numpy as np import pandas as pd -from ...models.phenotype import (ShigatypeGene) +from ...models.typing import ShigaTypingMethodIndex, TypingMethod, TypingResultShiga +from ...models.typing import TypingSoftware as Software LOG = logging.getLogger(__name__) +def parse_shigapass_pred(path: str) -> ShigaTypingMethodIndex: + """Parse shigapass prediction results.""" + LOG.info("Parsing shigapass prediction") + cols = { + "Name": "sample_name", + "rfb_hits,(%)": "rfb_hits", + "MLST": "mlst", + "fliC": "flic", + "CRISPR": "crispr", + "ipaH": "ipah", + "Predicted_Serotype": "predicted_serotype", + "Predicted_FlexSerotype": "predicted_flex_serotype", + "Comments": "comments", + } + # read as dataframe and process data structure + hits = ( + pd.read_csv(path, delimiter=";", na_values=["ND", "none"]) + .rename(columns=cols) + .replace(np.nan, None) + ) + shigatype_results = _parse_shigapass_results(hits, 0) + return ShigaTypingMethodIndex( + type=TypingMethod.SHIGATYPE, + result=shigatype_results, + software=Software.SHIGAPASS, + ) + + def _extract_percentage(rfb_hits: str) -> float: pattern = r"([0-9\.]+)%" match = re.search(pattern, rfb_hits) @@ -20,8 +50,8 @@ def _extract_percentage(rfb_hits: str) -> float: return percentile_value -def parse_shiga_gene(predictions: pd.DataFrame, row: int) -> ShigatypeGene: - return ShigatypeGene( +def _parse_shigapass_results(predictions: pd.DataFrame, row: int) -> TypingResultShiga: + return TypingResultShiga( rfb=predictions.loc[row, "rfb"], rfb_hits=_extract_percentage(str(predictions.loc[row, "rfb_hits"])), mlst=predictions.loc[row, "mlst"], diff --git a/prp/parse/typing.py b/prp/parse/typing.py index 9227e5c..3e52b35 100644 --- a/prp/parse/typing.py +++ b/prp/parse/typing.py @@ -20,8 +20,6 @@ from ..models.typing import TypingSoftware as Software from .phenotype.serotypefinder import parse_serotype_gene from .phenotype.virulencefinder import parse_vir_gene -from .phenotype.emmtyper import parse_emm_gene -from .phenotype.shigapass import parse_shiga_gene LOG = logging.getLogger(__name__) @@ -260,52 +258,3 @@ def parse_serotypefinder_oh_typing(path: str) -> MethodIndex | None: ) ) return pred_result - - -def parse_emmtyper_pred(path: str) -> MethodIndex | None: - """Parse emmtyper's output re emm-typing""" - LOG.info("Parsing emmtyper results") - pred_result = [] - df = pd.read_csv(path, sep='\t', header=None) - df.columns = ["sample_name", "cluster_count", "emmtype", "emm_like_alleles", "emm_cluster"] - df_loa = df.to_dict(orient="records") - for emmtype_array in df_loa: - emm_gene = parse_emm_gene(emmtype_array) - gene = TypingResultGeneAllele(**emm_gene.model_dump()) - pred_result.append( - MethodIndex( - type=TypingMethod.EMMTYPE, - software=Software.EMMTYPER, - result=gene, - ) - ) - return pred_result - - -def parse_shigapass_pred(path: str) -> MethodIndex: - """Parse shigapass prediction results.""" - LOG.info("Parsing shigapass prediction") - cols = { - "Name": "sample_name", - "rfb_hits,(%)": "rfb_hits", - "MLST": "mlst", - "fliC": "flic", - "CRISPR": "crispr", - "ipaH": "ipah", - "Predicted_Serotype": "predicted_serotype", - "Predicted_FlexSerotype": "predicted_flex_serotype", - "Comments": "comments", - } - # read as dataframe and process data structure - hits = ( - pd.read_csv(path, delimiter=";", na_values=["ND", "none"]) - .rename(columns=cols) - .replace(np.nan, None) - ) - shigatype_gene = parse_shiga_gene(hits, 0) - gene = TypingResultGeneAllele(**shigatype_gene.model_dump()) - return MethodIndex( - type=TypingMethod.SHIGATYPE, - result=gene, - software=Software.SHIGAPASS, - ) From 7b89b88b0a27b281f70d86311eeefc4ca5ed4105 Mon Sep 17 00:00:00 2001 From: ryanjameskennedy Date: Wed, 9 Oct 2024 14:05:21 +0200 Subject: [PATCH 6/9] Update expected output format/structure --- prp/models/typing.py | 8 ++++---- prp/parse/phenotype/emmtyper.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/prp/models/typing.py b/prp/models/typing.py index 1e4743c..b5d34ef 100644 --- a/prp/models/typing.py +++ b/prp/models/typing.py @@ -102,10 +102,10 @@ class ShigaTypingMethodIndex(RWModel): class TypingResultEmm(RWModel): """Container for emmtype gene information""" - cluster_count: Optional[int] = None - emmtype: Optional[str] = None - emm_like_alleles: list[str] = None - emm_cluster: Optional[str] = None + cluster_count: int + emmtype: str + emm_like_alleles: list[str] + emm_cluster: str class EmmTypingMethodIndex(RWModel): diff --git a/prp/parse/phenotype/emmtyper.py b/prp/parse/phenotype/emmtyper.py index 52b5f27..0e4049a 100644 --- a/prp/parse/phenotype/emmtyper.py +++ b/prp/parse/phenotype/emmtyper.py @@ -10,7 +10,7 @@ LOG = logging.getLogger(__name__) -def parse_emmtyper_pred(path: str) -> EmmTypingMethodIndex | None: +def parse_emmtyper_pred(path: str) -> EmmTypingMethodIndex: """Parse emmtyper's output re emm-typing""" LOG.info("Parsing emmtyper results") pred_result = [] From 0f1f1ff3c3c3d142e5e4278c65470caafbf0c0b0 Mon Sep 17 00:00:00 2001 From: ryanjameskennedy Date: Wed, 9 Oct 2024 14:05:54 +0200 Subject: [PATCH 7/9] Add tests for emmtyper --- tests/fixtures/__init__.py | 1 + tests/fixtures/streptococcus/__init__.py | 10 ++++++++++ tests/fixtures/streptococcus/emmtyper.tsv | 1 + tests/parse/test_emmtyper.py | 22 ++++++++++++++++++++++ 4 files changed, 34 insertions(+) create mode 100644 tests/fixtures/streptococcus/__init__.py create mode 100644 tests/fixtures/streptococcus/emmtyper.tsv create mode 100644 tests/parse/test_emmtyper.py diff --git a/tests/fixtures/__init__.py b/tests/fixtures/__init__.py index bc525d3..214fa3e 100644 --- a/tests/fixtures/__init__.py +++ b/tests/fixtures/__init__.py @@ -4,3 +4,4 @@ from .mtuberculosis import * from .saureus import * from .shigella import * +from .streptococcus import * diff --git a/tests/fixtures/streptococcus/__init__.py b/tests/fixtures/streptococcus/__init__.py new file mode 100644 index 0000000..6d02b5c --- /dev/null +++ b/tests/fixtures/streptococcus/__init__.py @@ -0,0 +1,10 @@ +"""Fixtures for Streptococcus.""" +import pytest + +from ..fixtures import data_path + + +@pytest.fixture() +def streptococcus_emmtyper_path(data_path): + """Get path for Emmtyper results for streptococcus.""" + return str(data_path.joinpath("streptococcus", "emmtyper.tsv")) diff --git a/tests/fixtures/streptococcus/emmtyper.tsv b/tests/fixtures/streptococcus/emmtyper.tsv new file mode 100644 index 0000000..06ba33b --- /dev/null +++ b/tests/fixtures/streptococcus/emmtyper.tsv @@ -0,0 +1 @@ +test1_240920_nb000000_0000_test 2 EMM169.3 EMM164.2~* E4 diff --git a/tests/parse/test_emmtyper.py b/tests/parse/test_emmtyper.py new file mode 100644 index 0000000..8653085 --- /dev/null +++ b/tests/parse/test_emmtyper.py @@ -0,0 +1,22 @@ +"""Test functions for parsing Emmtyper results.""" + +import pytest + +from prp.parse.phenotype.emmtyper import parse_emmtyper_pred + + +def test_parse_emmtyper_results(streptococcus_emmtyper_path): + """Test parsing of emmtyper result files.""" + + # test parsing the output of an streptococcus. + result = parse_emmtyper_pred(streptococcus_emmtyper_path) + expected_streptococcus = { + "cluster_count": 2, + "emmtype": "EMM169.3", + "emm_like_alleles": [ + "EMM164.2~*" + ], + "emm_cluster": "E4" + } + # check if data matches + assert expected_streptococcus == result[0].result.model_dump() \ No newline at end of file From e8b27fa6ee4a66041c6a6ca9a2f02ac200a46d4f Mon Sep 17 00:00:00 2001 From: ryanjameskennedy Date: Thu, 10 Oct 2024 09:35:25 +0200 Subject: [PATCH 8/9] Quick spelling and unused module fixes --- prp/models/typing.py | 2 +- prp/parse/typing.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/prp/models/typing.py b/prp/models/typing.py index b5d34ef..4d04e00 100644 --- a/prp/models/typing.py +++ b/prp/models/typing.py @@ -109,7 +109,7 @@ class TypingResultEmm(RWModel): class EmmTypingMethodIndex(RWModel): - """Method Index Shiga.""" + """Method Index Emm.""" type: Literal[TypingMethod.EMMTYPE] software: Literal[TypingSoftware.EMMTYPER] diff --git a/prp/parse/typing.py b/prp/parse/typing.py index 3e52b35..6b4cec8 100644 --- a/prp/parse/typing.py +++ b/prp/parse/typing.py @@ -3,8 +3,6 @@ import csv import json import logging -import pandas as pd -import numpy as np from ..models.sample import MethodIndex from ..models.typing import ( From b25c37a032c65eec98cf0f38184bc3ce82d01426 Mon Sep 17 00:00:00 2001 From: ryanjameskennedy Date: Thu, 10 Oct 2024 14:01:30 +0200 Subject: [PATCH 9/9] Update CHANGELOG re emmtyper pytests --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c05a59c..b398c8a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ### Added - Added emmtyper and parser + - Added pytests for emmtyper ### Fixed