diff --git a/notebooks/testingstuff.ipynb b/notebooks/testingstuff.ipynb index 28ebb9da..f3677c18 100644 --- a/notebooks/testingstuff.ipynb +++ b/notebooks/testingstuff.ipynb @@ -2,8 +2,13 @@ "cells": [ { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2024-01-30T20:55:12.659786Z", + "start_time": "2024-01-30T20:55:12.375781Z" + } + }, "outputs": [], "source": [ "import json\n", @@ -33,7 +38,9 @@ " replace_with_digest\n", ")\n", "def pretty_print(d: dict):\n", - " print(json.dumps(d, indent=2))" + " print(json.dumps(d, indent=2))\n", + "def pydantic_dict(o):\n", + " return o.dict(exclude_none=True)" ] }, { @@ -57,9 +64,22 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2024-01-30T20:55:12.664116Z", + "start_time": "2024-01-30T20:55:12.662083Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id=None label=None description=None extensions=None digest=None type='Allele' expressions=None location=SequenceLocation(id=None, label=None, description=None, extensions=None, digest=None, type='SequenceLocation', sequenceReference=None, start=55181319, end=55181320) state=LiteralSequenceExpression(id=None, label=None, description=None, extensions=None, digest=None, type='LiteralSequenceExpression', sequence=SequenceString(root='T'))\n" + ] + } + ], "source": [ "allele_dict = {\n", " 'location': {\n", @@ -99,9 +119,36 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2024-01-30T20:55:12.669479Z", + "start_time": "2024-01-30T20:55:12.664423Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"location\": {\n", + " \"type\": \"SequenceLocation\",\n", + " \"start\": 55181319,\n", + " \"end\": 55181320,\n", + " \"sequenceReference\": null,\n", + " \"digest\": \"5mvu29n_A07DBCqsGyrjk2NUknOhkVZS\"\n", + " },\n", + " \"state\": {\n", + " \"type\": \"LiteralSequenceExpression\",\n", + " \"sequence\": \"T\"\n", + " },\n", + " \"type\": \"Allele\",\n", + " \"digest\": \"eahJQ_NsonA4qMlGbBrFEJBlIMUenRLI\"\n", + "}\n" + ] + } + ], "source": [ "allele_identified = identify_all(allele)\n", "pretty_print(allele_identified)" @@ -109,9 +156,27 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2024-01-30T20:55:12.684277Z", + "start_time": "2024-01-30T20:55:12.672225Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Location serialized: b'{\"end\":55181320,\"sequenceReference\":null,\"start\":55181319,\"type\":\"SequenceLocation\"}'\n", + "Location digest: 5mvu29n_A07DBCqsGyrjk2NUknOhkVZS\n", + "Location digest: 5mvu29n_A07DBCqsGyrjk2NUknOhkVZS\n", + "Allele serialized: b'{\"location\":\"5mvu29n_A07DBCqsGyrjk2NUknOhkVZS\",\"state\":{\"sequence\":\"T\",\"type\":\"LiteralSequenceExpression\"},\"type\":\"Allele\"}'\n", + "Allele digest: eahJQ_NsonA4qMlGbBrFEJBlIMUenRLI\n", + "Allele digest: eahJQ_NsonA4qMlGbBrFEJBlIMUenRLI\n" + ] + } + ], "source": [ "location_serialized = ga4gh_serialize(allele.location)\n", "print(\"Location serialized: \" + str(location_serialized))\n", @@ -126,9 +191,22 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2024-01-30T20:55:12.709066Z", + "start_time": "2024-01-30T20:55:12.675406Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average duration: 7.207540911622346e-05\n" + ] + } + ], "source": [ "ct = 100\n", "start = timer()\n", @@ -140,9 +218,22 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2024-01-30T20:55:12.710289Z", + "start_time": "2024-01-30T20:55:12.703490Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average duration: 0.00015179874957539142\n" + ] + } + ], "source": [ "# genotype \n", "# https://www.ncbi.nlm.nih.gov/clinvar/variation/431013/\n", @@ -225,9 +316,22 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2024-01-30T20:55:12.740335Z", + "start_time": "2024-01-30T20:55:12.737050Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average duration: 0.0002980487502645701\n" + ] + } + ], "source": [ "genotype_431013_dict = {\n", " \"type\": \"Genotype\",\n", @@ -261,8 +365,13 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2024-01-30T20:55:12.832604Z", + "start_time": "2024-01-30T20:55:12.740570Z" + } + }, "outputs": [], "source": [ "import ga4gh.vrs.extras.translator\n", @@ -272,35 +381,176 @@ "importlib.reload(ga4gh.vrs.dataproxy)\n", "importlib.reload(ga4gh.vrs)\n", "\n", + "from ga4gh.vrs import models\n", + "\n", "from ga4gh.vrs.extras.translator import Translator\n", "from ga4gh.vrs.dataproxy import SeqRepoDataProxy\n", "from biocommons.seqrepo import SeqRepo\n", "\n", "\n", - "data_proxy = SeqRepoDataProxy(SeqRepo(\"/Users/kferrite/dev/biocommons.seqrepo/seqrepo/2021-01-29\"))\n", - "translator = Translator(data_proxy=data_proxy)\n", - "# translator._from_beacon(\"13 : 32936732 G > C\")\n", - "# data_proxy.get_metadata('GRCh38:13')\n", - "spdi_383650 = 'NC_000009.12:128325834:C:T'\n", - "translator._from_spdi(spdi_383650)" + "data_proxy = SeqRepoDataProxy(SeqRepo(\"/usr/local/share/seqrepo/2021-01-29\"))\n", + "translator = Translator(data_proxy=data_proxy)\n" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2024-01-30T20:55:12.836510Z", + "start_time": "2024-01-30T20:55:12.833916Z" + } + }, "outputs": [], "source": [ - "pretty_print(allele_280320.model_dump(exclude_none=True))\n", - "ga4gh.vrs.normalize(allele_280320, data_proxy=data_proxy)" + "expansion_allele = models.Allele(\n", + " expressions=[\n", + " models.Expression(\n", + " syntax='spdi',\n", + " value='NC_000001.11:40819438:CTCCTCCT:CTCCTCCTCCT'\n", + " )\n", + " ],\n", + " location=models.SequenceLocation(\n", + " sequenceReference=models.SequenceReference(\n", + " refgetAccession='SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO',\n", + " residueAlphabet='na',\n", + " id='NC_000001.11:',\n", + " ),\n", + " start=40819438,\n", + " end=40819438),\n", + " state=models.LiteralSequenceExpression(\n", + " sequence='CTC'\n", + " )\n", + ")" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# This is expected to normalize to a fully-justified allele with a ReferenceLengthExpression\n", + "normalized = ga4gh.vrs.normalize(expansion_allele, data_proxy=data_proxy)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-30T20:55:35.795398Z", + "start_time": "2024-01-30T20:55:35.785972Z" + } + }, + "execution_count": 12 + }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"type\": \"Allele\",\n", + " \"expressions\": [\n", + " {\n", + " \"syntax\": \"spdi\",\n", + " \"value\": \"NC_000001.11:40819438:CTCCTCCT:CTCCTCCTCCT\"\n", + " }\n", + " ],\n", + " \"location\": {\n", + " \"type\": \"SequenceLocation\",\n", + " \"sequenceReference\": {\n", + " \"id\": \"NC_000001.11:\",\n", + " \"type\": \"SequenceReference\",\n", + " \"refgetAccession\": \"SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO\",\n", + " \"residueAlphabet\": \"na\"\n", + " },\n", + " \"start\": 40819438,\n", + " \"end\": 40819446\n", + " },\n", + " \"state\": {\n", + " \"type\": \"ReferenceLengthExpression\",\n", + " \"length\": 11,\n", + " \"sequence\": \"CTCCTCCTCCT\",\n", + " \"repeatSubunitLength\": 3\n", + " }\n", + "}\n" + ] + } + ], + "source": [ + "pretty_print(pydantic_dict(normalized))" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-30T20:55:44.585796Z", + "start_time": "2024-01-30T20:55:44.579823Z" + } + }, + "execution_count": 13 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": "'ga4gh:VA.L1iW0hEkuerURCtRni6HdEnEIdLoGvog'" + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ga4gh_identify(normalized)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-30T21:11:45.983540Z", + "start_time": "2024-01-30T21:11:45.980918Z" + } + }, + "execution_count": 24 + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "# Descriptive properties do not affect digest\n", + "normalized.label = \"test\"" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-30T21:13:11.467490Z", + "start_time": "2024-01-30T21:13:11.466190Z" + } + }, + "execution_count": 26 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": "'ga4gh:VA.L1iW0hEkuerURCtRni6HdEnEIdLoGvog'" + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ga4gh_identify(normalized)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-30T21:13:31.090021Z", + "start_time": "2024-01-30T21:13:31.086682Z" + } + }, + "execution_count": 28 } ], "metadata": { diff --git a/src/ga4gh/core/__init__.py b/src/ga4gh/core/__init__.py index c2de1300..f013a087 100644 --- a/src/ga4gh/core/__init__.py +++ b/src/ga4gh/core/__init__.py @@ -13,7 +13,7 @@ parse_ga4gh_identifier, GA4GHComputeIdentifierWhen, use_ga4gh_compute_identifier_when ) from ._internal.pydantic import ( - is_pydantic_instance, is_curie_type, is_identifiable, is_literal, pydantic_copy + is_pydantic_instance, is_curie_type, is_ga4gh_identifiable, is_literal, pydantic_copy ) from ._internal import models as core_models diff --git a/src/ga4gh/core/_internal/enderef.py b/src/ga4gh/core/_internal/enderef.py index 1f4c4001..35090de0 100644 --- a/src/ga4gh/core/_internal/enderef.py +++ b/src/ga4gh/core/_internal/enderef.py @@ -15,7 +15,7 @@ is_pydantic_instance, is_list, is_curie_type, - is_identifiable, + is_ga4gh_identifiable, get_pydantic_root, pydantic_copy) @@ -60,7 +60,7 @@ def _enref(o): if not is_pydantic_instance(o): raise ValueError("Called ga4gh_enref() with non-pydantic instance") - if not is_identifiable(o): + if not is_ga4gh_identifiable(o): raise ValueError("Called ga4gh_enref() with non-identifiable object") # in-place replacement on object copy @@ -101,7 +101,7 @@ def _deref(o): if not is_pydantic_instance(o): raise ValueError("Called ga4gh_deref() with non-non-pydantic instance") - if not is_identifiable(o): + if not is_ga4gh_identifiable(o): raise ValueError("Called ga4gh_deref() with non-identifiable object") # in-place replacement on object copy diff --git a/src/ga4gh/core/_internal/identifiers.py b/src/ga4gh/core/_internal/identifiers.py index bdf1b0b3..160ec0f2 100644 --- a/src/ga4gh/core/_internal/identifiers.py +++ b/src/ga4gh/core/_internal/identifiers.py @@ -28,7 +28,7 @@ from .pydantic import ( is_pydantic_instance, is_curie_type, - is_identifiable, + is_ga4gh_identifiable, getattr_in, get_pydantic_root, is_pydantic_custom_type @@ -143,7 +143,7 @@ def ga4gh_identify(vro): 'ga4gh:VSL.u5fspwVbQ79QkX6GHLF8tXPCAXFJqRPx' """ - if is_identifiable(vro): + if is_ga4gh_identifiable(vro): when_rule = ga4gh_compute_identifier_when.get(GA4GHComputeIdentifierWhen.ALWAYS) do_compute = False ir = None @@ -281,7 +281,7 @@ def identify_all( if is_pydantic_custom_type(input_obj): val = export_pydantic_model(input_obj) - if isinstance(val, str) and is_curie_type(val) and is_ga4gh_identifier(val): + if isinstance(val, str) and is_ga4gh_identifier(val): val = parse_ga4gh_identifier(val)["digest"] output_obj = val elif is_pydantic_instance(input_obj): @@ -307,7 +307,7 @@ def identify_all( # Assumes any obj with 'digest' should be collapsed. collapsed_output_obj = collapse_identifiable_values(output_obj) # Add a digest to the output if it is identifiable - if is_identifiable(input_obj): + if is_ga4gh_identifiable(input_obj): # Compute digest for updated object, not re-running compaction output_obj["digest"] = ga4gh_digest(collapsed_output_obj, do_compact=False) else: @@ -317,22 +317,22 @@ def identify_all( return output_obj -def scrape_model_metadata(obj, meta={}) -> dict: - """ - For a Pydantic object obj, pull out .ga4gh.identifiable - and .ga4gh.keys and put them in meta keyed by the class name of obj - """ - assert isinstance(obj, BaseModel) - name = type(obj).__name__ - if is_pydantic_custom_str_type(obj): - meta[name] = {"identifiable": False, "keys": None} - else: - meta[name] = {} - identifiable = getattr_in(obj, ["ga4gh", "identifiable"]) - if identifiable: - meta[name]["identifiable"] = identifiable - keys = getattr_in(obj, ["ga4gh", "keys"]) - if keys and len(keys) > 0: - meta[name]["keys"] = keys - # TODO recurse into fields - return meta +# def scrape_model_metadata(obj, meta={}) -> dict: +# """ +# For a Pydantic object obj, pull out .ga4gh.identifiable +# and .ga4gh.keys and put them in meta keyed by the class name of obj +# """ +# assert isinstance(obj, BaseModel) +# name = type(obj).__name__ +# if is_pydantic_custom_str_type(obj): +# meta[name] = {"identifiable": False, "keys": None} +# else: +# meta[name] = {} +# identifiable = getattr_in(obj, ["ga4gh", "identifiable"]) +# if identifiable: +# meta[name]["identifiable"] = identifiable +# keys = getattr_in(obj, ["ga4gh", "keys"]) +# if keys and len(keys) > 0: +# meta[name]["keys"] = keys +# # TODO recurse into fields +# return meta diff --git a/src/ga4gh/core/_internal/pydantic.py b/src/ga4gh/core/_internal/pydantic.py index 473e6c76..85e70192 100644 --- a/src/ga4gh/core/_internal/pydantic.py +++ b/src/ga4gh/core/_internal/pydantic.py @@ -21,15 +21,15 @@ def getattr_in(obj, names) -> Any: return v -def is_identifiable(o: Any) -> bool: +def is_ga4gh_identifiable(o: Any) -> bool: """ - Determine if object is identifiable. An object is considered identifiable if - contains a `ga4gh_digest` attribute + Determine if object is GA4GH identifiable. An object is considered + GA4GH identifiable if it contains a `ga4gh_prefix` attribute :param o: Object - :return: `True` if `o` has `ga4gh_digest` attribute. `False` otherwise. + :return: `True` if `o` has `ga4gh_prefix` attribute. `False` otherwise. """ - return getattr_in(o, ['ga4gh', 'identifiable']) + return bool(getattr_in(o, ['ga4gh', 'prefix'])) def is_literal(o: Any) -> bool: diff --git a/src/ga4gh/vrs/_internal/models.py b/src/ga4gh/vrs/_internal/models.py index e928473d..6a8767b5 100644 --- a/src/ga4gh/vrs/_internal/models.py +++ b/src/ga4gh/vrs/_internal/models.py @@ -26,7 +26,7 @@ from pydantic import BaseModel, ConfigDict, Field, RootModel, constr from ga4gh.core._internal.pydantic import ( - is_identifiable, + is_ga4gh_identifiable, getattr_in ) from ga4gh.core._internal.models import IRI, _Entity @@ -91,7 +91,7 @@ def pydantic_class_refatt_map(): # Types directly reffable reffable_classes = list(filter( lambda c: ('id' in c.model_fields - and is_identifiable(c)), + and is_ga4gh_identifiable(c)), model_classes )) # Types reffable because they are a union of reffable types @@ -172,16 +172,17 @@ class _ValueObject(_Entity): description='A sha512t24u digest created using the VRS Computed Identifier algorithm.', ) + class ga4gh: + keys: List[str] + class _Ga4ghIdentifiableObject(_ValueObject): """A contextual value object for which a GA4GH computed identifier can be created.""" type: str - class ga4gh: - identifiable = True + class ga4gh(_ValueObject.ga4gh): prefix: str - keys: List[str] class Expression(BaseModel): @@ -239,7 +240,7 @@ class SequenceReference(_ValueObject): ) residueAlphabet: Optional[ResidueAlphabet] = None - class ga4gh: + class ga4gh(_ValueObject.ga4gh): assigned: bool = Field( True, description='This special property indicates that the `digest` field follows an alternate convention and is expected to have the value assigned following that convention. For SequenceReference, it is expected the digest will be the refget accession value without the `SQ.` prefix.' @@ -262,6 +263,13 @@ class ReferenceLengthExpression(_ValueObject): None, description='The number of residues in the repeat subunit.' ) + class ga4gh(_ValueObject.ga4gh): + keys = [ + 'length', + 'repeatSubunitLength', + 'type' + ] + class LiteralSequenceExpression(_ValueObject): """An explicit expression of a Sequence.""" @@ -271,6 +279,12 @@ class LiteralSequenceExpression(_ValueObject): ) sequence: SequenceString = Field(..., description='the literal sequence') + class ga4gh(_ValueObject.ga4gh): + keys = [ + 'sequence', + 'type' + ] + class SequenceLocation(_Ga4ghIdentifiableObject): """A `Location` defined by an interval on a referenced `Sequence`.""" @@ -407,7 +421,6 @@ class GenotypeMember(_ValueObject): ) class ga4gh(_Ga4ghIdentifiableObject.ga4gh): - identifiable = False keys = [ 'type', 'count', diff --git a/tests/extras/data/test_vcf_expected_altsonly_output.vcf.gz b/tests/extras/data/test_vcf_expected_altsonly_output.vcf.gz index f1f0b18a..cf73bc98 100644 Binary files a/tests/extras/data/test_vcf_expected_altsonly_output.vcf.gz and b/tests/extras/data/test_vcf_expected_altsonly_output.vcf.gz differ diff --git a/tests/extras/data/test_vcf_expected_output.vcf.gz b/tests/extras/data/test_vcf_expected_output.vcf.gz index e6d1e7d3..7317e933 100644 Binary files a/tests/extras/data/test_vcf_expected_output.vcf.gz and b/tests/extras/data/test_vcf_expected_output.vcf.gz differ diff --git a/tests/extras/data/test_vcf_expected_output_no_vrs_attrs.vcf.gz b/tests/extras/data/test_vcf_expected_output_no_vrs_attrs.vcf.gz index b5e17a02..e886f1b9 100644 Binary files a/tests/extras/data/test_vcf_expected_output_no_vrs_attrs.vcf.gz and b/tests/extras/data/test_vcf_expected_output_no_vrs_attrs.vcf.gz differ diff --git a/tests/extras/test_allele_translator.py b/tests/extras/test_allele_translator.py index 4e7b0fa6..a249cea0 100644 --- a/tests/extras/test_allele_translator.py +++ b/tests/extras/test_allele_translator.py @@ -111,7 +111,7 @@ def tlr(rest_dataproxy): insertion_output = { "location": { "end": 20003010, - "start":20003010, + "start": 20003010, "sequenceReference": { "refgetAccession": "SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT", "type": "SequenceReference" @@ -265,6 +265,7 @@ def test_to_spdi(tlr): assert 1 == len(to_spdi) assert spdiexpr == to_spdi[0] + hgvs_tests = ( ("NC_000013.11:g.32936732=", { "id": "ga4gh:VA.GuDPEe-WojSx4b4DxupN3si1poaR61qL", @@ -303,7 +304,7 @@ def test_to_spdi(tlr): "type": "Allele" }), ("NC_000007.14:g.55181220del", { - "id": "ga4gh:VA.BNV6SfAuqDYKTTRknLcS-QuTryF5rSBi", + "id": "ga4gh:VA.wlYnlMsWc0ZTPZb-nQv2dXHbFcXa6J9u", "location": { "id": "ga4gh:SL.hnIOG_kul0Lf3mO1ddTRFb0GbQhtQ19t", "end": 55181220, @@ -341,7 +342,7 @@ def test_to_spdi(tlr): "type": "Allele" }), ("NC_000013.11:g.32331093_32331094dup", { - "id": "ga4gh:VA.g-q4OzcyYFC5eVQFSrbXwgJScSREvrY-", + "id": "ga4gh:VA.x5iNzjjXbb1-wWTBLMBcicYlCMwYoedq", "location": { "id": "ga4gh:SL.PJ8lHWhAMNRSrxHvkarfDjRWxF-GwaJ_", "end": 32331094, @@ -361,7 +362,7 @@ def test_to_spdi(tlr): "type": "Allele" }), ("NC_000013.11:g.32316467dup", { - "id": "ga4gh:VA._KlbF6GZCbuLxbL9z4hZE3oZSLzBHstS", + "id": "ga4gh:VA.ZAyA7Mmd7ERWN6CEd6muxn2mk_gTvEvF", "location": { "id": "ga4gh:SL.LURTeRdwh5bQf_QqPBoaA--MECYmrY5U", "end": 32316467, @@ -434,7 +435,7 @@ def test_hgvs(tlr, hgvsexpr, expected): def test_to_hgvs_invalid(tlr): # IRI is passed - iri_vo = models.Allele( + iri_vo = models.Allele( **{ "location": { "end": 1263, diff --git a/tests/extras/test_vcf_annotation.py b/tests/extras/test_vcf_annotation.py index c39ffe64..1b134794 100644 --- a/tests/extras/test_vcf_annotation.py +++ b/tests/extras/test_vcf_annotation.py @@ -8,10 +8,12 @@ TEST_DATA_DIR = "tests/extras/data" + @pytest.fixture def vcf_annotator(): return VCFAnnotator("rest") + @pytest.mark.vcr def test_annotate_vcf_grch38_noattrs(vcf_annotator, vcr_cassette): vcr_cassette.allow_playback_repeats = False @@ -26,12 +28,14 @@ def test_annotate_vcf_grch38_noattrs(vcf_annotator, vcr_cassette): out_vcf_lines = out_vcf.readlines() with gzip.open(expected_vcf_no_vrs_attrs, "rt") as expected_output: expected_output_lines = expected_output.readlines() - assert out_vcf_lines == expected_output_lines + for actual_line, expected_line in zip(out_vcf_lines, expected_output_lines): + assert actual_line == expected_line assert os.path.exists(output_vrs_pkl) assert vcr_cassette.all_played os.remove(output_vcf) os.remove(output_vrs_pkl) + @pytest.mark.vcr def test_annotate_vcf_grch38_attrs(vcf_annotator, vcr_cassette): vcr_cassette.allow_playback_repeats = False @@ -46,12 +50,14 @@ def test_annotate_vcf_grch38_attrs(vcf_annotator, vcr_cassette): out_vcf_lines = out_vcf.readlines() with gzip.open(expected_vcf, "rt") as expected_output: expected_output_lines = expected_output.readlines() - assert out_vcf_lines == expected_output_lines + for actual_line, expected_line in zip(out_vcf_lines, expected_output_lines): + assert actual_line == expected_line assert os.path.exists(output_vrs_pkl) assert vcr_cassette.all_played os.remove(output_vcf) os.remove(output_vrs_pkl) + @pytest.mark.vcr def test_annotate_vcf_grch38_attrs_altsonly(vcf_annotator, vcr_cassette): vcr_cassette.allow_playback_repeats = False @@ -66,12 +72,14 @@ def test_annotate_vcf_grch38_attrs_altsonly(vcf_annotator, vcr_cassette): out_vcf_lines = out_vcf.readlines() with gzip.open(expected_altsonly_vcf, "rt") as expected_output: expected_output_lines = expected_output.readlines() - assert out_vcf_lines == expected_output_lines + for actual_line, expected_line in zip(out_vcf_lines, expected_output_lines): + assert actual_line == expected_line assert os.path.exists(output_vrs_pkl) assert vcr_cassette.all_played os.remove(output_vcf) os.remove(output_vrs_pkl) + @pytest.mark.vcr def test_annotate_vcf_grch37_attrs(vcf_annotator, vcr_cassette): vcr_cassette.allow_playback_repeats = False @@ -92,6 +100,7 @@ def test_annotate_vcf_grch37_attrs(vcf_annotator, vcr_cassette): os.remove(output_vcf) os.remove(output_vrs_pkl) + @pytest.mark.vcr def test_annotate_vcf_pickle_only(vcf_annotator, vcr_cassette): vcr_cassette.allow_playback_repeats = False @@ -106,6 +115,7 @@ def test_annotate_vcf_pickle_only(vcf_annotator, vcr_cassette): assert vcr_cassette.all_played os.remove(output_vrs_pkl) + @pytest.mark.vcr def test_annotate_vcf_vcf_only(vcf_annotator, vcr_cassette): vcr_cassette.allow_playback_repeats = False @@ -125,6 +135,7 @@ def test_annotate_vcf_vcf_only(vcf_annotator, vcr_cassette): assert not os.path.exists(output_vrs_pkl) os.remove(output_vcf) + def test_annotate_vcf_input_validation(vcf_annotator): input_vcf = f"{TEST_DATA_DIR}/test_vcf_input.vcf" @@ -132,6 +143,7 @@ def test_annotate_vcf_input_validation(vcf_annotator): vcf_annotator.annotate(input_vcf) assert str(e.value) == "Must provide one of: `vcf_out` or `vrs_pickle_out`" + @pytest.mark.vcr def test_get_vrs_object_invalid_input(vcf_annotator, caplog): """Test that _get_vrs_object method works as expected with invalid input"""