diff --git a/pgscatalog.match/pyproject.toml b/pgscatalog.match/pyproject.toml index 0133ce5..db67145 100644 --- a/pgscatalog.match/pyproject.toml +++ b/pgscatalog.match/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pgscatalog.match" -version = "0.3.2" +version = "0.3.3" description = "Tools for matching variants in PGS scoring files and target variant information files" authors = ["Benjamin Wingfield ", "Samuel Lambert ", "Laurent Gil "] readme = "README.md" diff --git a/pgscatalog.match/src/pgscatalog/match/cli/_write.py b/pgscatalog.match/src/pgscatalog/match/cli/_write.py index 491e67f..5cc391b 100644 --- a/pgscatalog.match/src/pgscatalog/match/cli/_write.py +++ b/pgscatalog.match/src/pgscatalog/match/cli/_write.py @@ -6,11 +6,11 @@ import gzip import itertools -from pgscatalog.core import MatchRateError +from pgscatalog.core.lib.pgsexceptions import MatchRateError -from ..lib.plinkscorefiles import PlinkScoreFiles +from pgscatalog.match.lib.plinkscorefiles import PlinkScoreFiles -from ._config import Config +from pgscatalog.match.cli._config import Config def write_matches(matchresults, score_df): diff --git a/pgscatalog.match/src/pgscatalog/match/cli/match_cli.py b/pgscatalog.match/src/pgscatalog/match/cli/match_cli.py index 7f83cce..b144968 100644 --- a/pgscatalog.match/src/pgscatalog/match/cli/match_cli.py +++ b/pgscatalog.match/src/pgscatalog/match/cli/match_cli.py @@ -9,7 +9,7 @@ import polars as pl -from .. import ( +from pgscatalog.match import ( VariantFrame, ScoringFileFrame, match_variants, @@ -17,8 +17,8 @@ MatchResults, ) -from ._config import Config -from ._write import write_matches +from pgscatalog.match.cli._config import Config +from pgscatalog.match.cli._write import write_matches logger = logging.getLogger(__name__) diff --git a/pgscatalog.match/src/pgscatalog/match/cli/merge_cli.py b/pgscatalog.match/src/pgscatalog/match/cli/merge_cli.py index db5ba32..82566e8 100644 --- a/pgscatalog.match/src/pgscatalog/match/cli/merge_cli.py +++ b/pgscatalog.match/src/pgscatalog/match/cli/merge_cli.py @@ -4,10 +4,10 @@ import pathlib import shutil -from ._config import Config -from ._write import write_matches +from pgscatalog.match.cli._config import Config +from pgscatalog.match.cli._write import write_matches -from ..lib import ScoringFileFrame, MatchResult, MatchResults +from pgscatalog.match.lib import ScoringFileFrame, MatchResult, MatchResults import polars as pl diff --git a/pgscatalog.match/src/pgscatalog/match/lib/__init__.py b/pgscatalog.match/src/pgscatalog/match/lib/__init__.py index 97c0c6c..40737ed 100644 --- a/pgscatalog.match/src/pgscatalog/match/lib/__init__.py +++ b/pgscatalog.match/src/pgscatalog/match/lib/__init__.py @@ -1,9 +1,9 @@ import logging -from .variantframe import VariantFrame -from .scoringfileframe import ScoringFileFrame, match_variants -from .matchresult import MatchResult, MatchResults -from .plinkscorefiles import PlinkScoreFiles +from pgscatalog.match.lib.variantframe import VariantFrame +from pgscatalog.match.lib.scoringfileframe import ScoringFileFrame, match_variants +from pgscatalog.match.lib.matchresult import MatchResult, MatchResults +from pgscatalog.match.lib.plinkscorefiles import PlinkScoreFiles log_fmt = "%(name)s: %(asctime)s %(levelname)-8s %(message)s" logging.basicConfig(format=log_fmt, datefmt="%Y-%m-%d %H:%M:%S") diff --git a/pgscatalog.match/src/pgscatalog/match/lib/_arrow.py b/pgscatalog.match/src/pgscatalog/match/lib/_arrow.py index 1d970d6..24129f3 100644 --- a/pgscatalog.match/src/pgscatalog/match/lib/_arrow.py +++ b/pgscatalog.match/src/pgscatalog/match/lib/_arrow.py @@ -6,12 +6,15 @@ """ import pathlib import tempfile +from typing import Optional import polars as pl from functools import singledispatch -import polars.exceptions -from pgscatalog.core import NormalisedScoringFile, TargetVariants, TargetType +from polars.io.csv import BatchedCsvReader + +from pgscatalog.match.lib.normalisedscoringfile import NormalisedScoringFile +from pgscatalog.match.lib.targetvariants import TargetVariants, TargetType @singledispatch @@ -25,7 +28,7 @@ def loose(path, tmpdir=None): @loose.register -def _(path: NormalisedScoringFile, tmpdir=None): +def _(path: NormalisedScoringFile, tmpdir=None): # type: ignore """Write NormalisedScoringFiles to a list of arrow IPC files""" if tmpdir is None: tmpdir = tempfile.mkdtemp() @@ -47,7 +50,7 @@ def _(path: NormalisedScoringFile, tmpdir=None): @loose.register -def _(path: TargetVariants, tmpdir=None): +def _(path: TargetVariants, tmpdir=None): # type: ignore """Writes TargetVariants to a list of arrow IPC files""" if tmpdir is None: tmpdir = tempfile.mkdtemp() @@ -63,7 +66,14 @@ def _(path: TargetVariants, tmpdir=None): "column_5": pl.String, "column_6": pl.String, } - new_colnames = ["#CHROM", "ID", "CM", "POS", "REF", "ALT"] + new_colnames: Optional[list[str]] = [ + "#CHROM", + "ID", + "CM", + "POS", + "REF", + "ALT", + ] header = False comment = None case TargetType.PVAR: @@ -94,7 +104,9 @@ def _(path: TargetVariants, tmpdir=None): return batch_read(reader, tmpdir=tmpdir, cols_keep=cols_keep) -def batch_read(reader, tmpdir, cols_keep) -> list[pathlib.Path]: +def batch_read( + reader: BatchedCsvReader, tmpdir: pathlib.Path, cols_keep: list[str] +) -> list[pathlib.Path]: """Read a CSV in batches and write them to temporary files""" arrowpaths = [] # batch_size should be >= thread pool size, so tasks will be distributed amongst workers diff --git a/pgscatalog.match/src/pgscatalog/match/lib/_match/label.py b/pgscatalog.match/src/pgscatalog/match/lib/_match/label.py index 2b048c7..e070119 100644 --- a/pgscatalog.match/src/pgscatalog/match/lib/_match/label.py +++ b/pgscatalog.match/src/pgscatalog/match/lib/_match/label.py @@ -12,18 +12,18 @@ import polars as pl from xopen import xopen -from .preprocess import complement_valid_alleles +from pgscatalog.match.lib._match.preprocess import complement_valid_alleles logger = logging.getLogger(__name__) def label_matches( df: pl.LazyFrame, - keep_first_match, - remove_ambiguous, - remove_multiallelic, - skip_flip, - filter_IDs, + keep_first_match: bool, + remove_ambiguous: bool, + remove_multiallelic: bool, + skip_flip: bool, + filter_IDs: pathlib.Path, ) -> pl.LazyFrame: """Label match candidates with additional metadata. Column definitions: @@ -220,7 +220,9 @@ def _label_duplicate_id(df: pl.LazyFrame, keep_first_match: bool) -> pl.LazyFram ) -def _label_biallelic_ambiguous(df: pl.LazyFrame, remove_ambiguous) -> pl.LazyFrame: +def _label_biallelic_ambiguous( + df: pl.LazyFrame, remove_ambiguous: bool +) -> pl.LazyFrame: """ Identify ambiguous variants (A/T & C/G SNPs) diff --git a/pgscatalog.match/src/pgscatalog/match/lib/_plinkframe.py b/pgscatalog.match/src/pgscatalog/match/lib/_plinkframe.py index d0f5f89..dafbabb 100644 --- a/pgscatalog.match/src/pgscatalog/match/lib/_plinkframe.py +++ b/pgscatalog.match/src/pgscatalog/match/lib/_plinkframe.py @@ -6,7 +6,7 @@ import gzip import pathlib -from ._match.plink import plinkify, pivot_score +from pgscatalog.match.lib._match.plink import plinkify, pivot_score class PlinkFrame: diff --git a/pgscatalog.match/src/pgscatalog/match/lib/matchresult.py b/pgscatalog.match/src/pgscatalog/match/lib/matchresult.py index 532f02a..62acf5a 100644 --- a/pgscatalog.match/src/pgscatalog/match/lib/matchresult.py +++ b/pgscatalog.match/src/pgscatalog/match/lib/matchresult.py @@ -3,12 +3,12 @@ import polars as pl -from pgscatalog.core import ZeroMatchesError, MatchRateError +from pgscatalog.core.lib.pgsexceptions import ZeroMatchesError, MatchRateError -from ._plinkframe import PlinkFrames -from ._match.label import label_matches -from ._match.filter import filter_scores -from ._match.log import make_logs, check_log_count, make_summary_log +from pgscatalog.match.lib._plinkframe import PlinkFrames +from pgscatalog.match.lib._match.label import label_matches +from pgscatalog.match.lib._match.filter import filter_scores +from pgscatalog.match.lib._match.log import make_logs, check_log_count, make_summary_log logger = logging.getLogger(__name__) diff --git a/pgscatalog.match/src/pgscatalog/match/lib/normalisedscoringfile.py b/pgscatalog.match/src/pgscatalog/match/lib/normalisedscoringfile.py new file mode 100644 index 0000000..611b057 --- /dev/null +++ b/pgscatalog.match/src/pgscatalog/match/lib/normalisedscoringfile.py @@ -0,0 +1,57 @@ +import csv + +from xopen import xopen + +from pgscatalog.core.lib.models import ScoreVariant + + +def _read_normalised_rows(path): + with xopen(path) as f: + reader = csv.DictReader(f, delimiter="\t") + for row in reader: + yield ScoreVariant(**row) + + +class NormalisedScoringFile: + """This class represents a ScoringFile that's been normalised to have a consistent format + + Its main purpose is to provide a convenient way to iterate over variants + + # TODO: replace with a pydantic model in pgscatalog.core + """ + + def __init__(self, path): + try: + with xopen(path): + pass + except TypeError: + self.is_path = False + self.path = str(path) + else: + self.is_path = True + self.path = path + finally: + # either a ScoringFile or a path to a combined file + self._scoringfile = path + + def __iter__(self): + yield from self.variants + + @property + def variants(self): + if self.is_path: + # get a fresh generator from the file + self._variants = _read_normalised_rows(self._scoringfile) + else: + # get a fresh generator from the normalise() method + self._variants = self._scoringfile.normalise() + + return self._variants + + def __repr__(self): + if self.is_path: + x = f"{repr(str(self._scoringfile))}" + else: + x = f"{repr(self._scoringfile)}" + + return f"{type(self).__name__}({x})" diff --git a/pgscatalog.match/src/pgscatalog/match/lib/scoringfileframe.py b/pgscatalog.match/src/pgscatalog/match/lib/scoringfileframe.py index 2a0eca0..13277d0 100644 --- a/pgscatalog.match/src/pgscatalog/match/lib/scoringfileframe.py +++ b/pgscatalog.match/src/pgscatalog/match/lib/scoringfileframe.py @@ -4,12 +4,12 @@ import polars as pl -from pgscatalog.core import NormalisedScoringFile +from pgscatalog.match.lib.normalisedscoringfile import NormalisedScoringFile -from ._arrow import loose -from ._match.preprocess import complement_valid_alleles -from ._match.match import get_all_matches -from .matchresult import MatchResult +from pgscatalog.match.lib._arrow import loose +from pgscatalog.match.lib._match.preprocess import complement_valid_alleles +from pgscatalog.match.lib._match.match import get_all_matches +from pgscatalog.match.lib.matchresult import MatchResult logger = logging.getLogger(__name__) diff --git a/pgscatalog.match/src/pgscatalog/match/lib/targetvariants.py b/pgscatalog.match/src/pgscatalog/match/lib/targetvariants.py new file mode 100644 index 0000000..68103d7 --- /dev/null +++ b/pgscatalog.match/src/pgscatalog/match/lib/targetvariants.py @@ -0,0 +1,187 @@ +"""This module contains classes to work with target variants. When a scoring file is +being reused to calculate scores for new genotypes, the new genotypes are target +genomes.""" + +import enum +import csv +import pathlib + +from xopen import xopen + + +class TargetVariant: + """A single target variant, including genomic coordinates and allele information + + >>> a = TargetVariant(chrom="1", pos=12, ref="A", alt="C", id='1:12:A:C') + >>> a + TargetVariant(chrom='1', pos=12, ref='A', alt='C', id='1:12:A:C') + + >>> b = a + >>> b == a + True + """ + + def __init__(self, *, chrom, pos, ref, alt, id): + self.chrom = chrom + self.pos = int(pos) + self.ref = ref + self.alt = alt + self.id = id + + def __repr__(self): + return ( + f"{self.__class__.__name__}(chrom={repr(self.chrom)}, pos=" + f"{repr(self.pos)}, " + f"ref={repr(self.ref)}, " + f"alt={repr(self.alt)}, " + f"id={repr(self.id)})" + ) + + def __hash__(self): + return hash((self.chrom, self.pos, self.ref, self.alt, self.id)) + + def __eq__(self, other): + if isinstance(other, TargetVariant): + return (self.chrom, self.pos, self.ref, self.alt) == ( + other.chrom, + other.pos, + other.ref, + other.alt, + ) + return False + + +class TargetVariants: + """A container of :class:`TargetVariant` + :raises: FileNotFoundError + + >>> from pgscatalog.core import Config # ignore, only to load test data + >>> pvar = TargetVariants(Config.ROOT_DIR / "tests" / "data" / "hapnest.pvar") + >>> pvar.ftype + + + Iterating over TargetVariants is done via a read-only generator attribute: + >>> pvar.variants # doctest: +ELLIPSIS + + >>> for variant in pvar: + ... variant + ... break + TargetVariant(chrom='14', pos=65003549, ref='T', alt='C', id='14:65003549:T:C') + + gzip and zstandard compression is transparently handled for pvar: + >>> pvar = TargetVariants(Config.ROOT_DIR / "tests" / "data" / "hapnest.pvar.zst") + >>> for variant in pvar: + ... variant + ... break + TargetVariant(chrom='14', pos=65003549, ref='T', alt='C', id='14:65003549:T:C') + + The same is true for bim files: + >>> bim = TargetVariants(Config.ROOT_DIR / "tests" / "data" / "hapnest.bim.gz") + >>> bim.ftype + + + >>> for variant in bim: + ... variant + ... break + TargetVariant(chrom='1', pos=10180, ref='C', alt='T', id='1:10180:T:C') + + >>> bim = TargetVariants(Config.ROOT_DIR / "tests" / "data" / "hapnest.bim.zst") + >>> for variant in bim: + ... variant + ... break + TargetVariant(chrom='1', pos=10180, ref='C', alt='T', id='1:10180:T:C') + + >>> bim = TargetVariants(Config.ROOT_DIR / "tests" / "data" / "hapnest.bim") + >>> for variant in bim: + ... variant + ... break + TargetVariant(chrom='1', pos=10180, ref='C', alt='T', id='1:10180:T:C') + + Note, A1/A2 isn't guaranteed to be ref/alt because of PLINK1 file format + limitations. PGS Catalog libraries handle this internally, but you should be + aware REF/ALT can be swapped by plink during VCF to bim conversion. + Some pvar files can contain a lot of comments in the header, which are ignored: + >>> pvar = TargetVariants(Config.ROOT_DIR / "tests" / "data" / "1000G.pvar") + >>> for variant in pvar: + ... variant + ... break + TargetVariant(chrom='1', pos=10390, ref='CCCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA', alt='C', id='1:10390:CCCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA:C') + """ + + def __init__(self, path, chrom=None): + match n := pathlib.Path(path).name: + case _ if "pvar" in n: + self.ftype = TargetType.PVAR + case _ if "bim" in n: + self.ftype = TargetType.BIM + case _: + raise ValueError(f"Unknown target type {n!r}") + + self._chrom = chrom + self._path = str(path) + + def __repr__(self): + return f"{self.__class__.__name__}(path={repr(self.path)})" + + def __iter__(self): + yield from self.variants + + @property + def path(self): + return self._path + + @property + def chrom(self): + return self._chrom + + @property + def variants(self): + match self.ftype: + case TargetType.BIM: + return read_bim(self.path) + case TargetType.PVAR: + return read_pvar(self.path) + case _: + raise ValueError + + +def read_pvar(path): + """Read plink2 pvar variant information files using python core library""" + + with xopen(path, "rt") as f: + # pvars do have a header column and support arbitrary columns + for line in f: + if line.startswith("##"): + continue + else: + fieldnames = line.strip().split("\t") + break + reader = csv.DictReader(f, fieldnames=fieldnames, delimiter="\t") + fields = { + "#CHROM": "chrom", + "POS": "pos", + "REF": "ref", + "ALT": "alt", + "ID": "id", + } + for row in reader: + yield TargetVariant(**{v: row[k] for k, v in fields.items()}) + + +def read_bim(path): + """Read plink1 bim variant information files using python core library""" + with xopen(path, "rt") as f: + # bims don't have header column + reader = csv.reader(f, delimiter="\t") + # yes, A1/A2 in bim isn't ref/alt + fields = ["chrom", "id", "pos_cm", "pos", "ref", "alt"] + for row in reader: + row = dict(zip(fields, row, strict=True)) + yield TargetVariant( + **{k: row[k] for k in ("chrom", "pos", "ref", "alt", "id")} + ) + + +class TargetType(enum.Enum): + PVAR = enum.auto() + BIM = enum.auto() diff --git a/pgscatalog.match/src/pgscatalog/match/lib/variantframe.py b/pgscatalog.match/src/pgscatalog/match/lib/variantframe.py index f95cce9..a195c67 100644 --- a/pgscatalog.match/src/pgscatalog/match/lib/variantframe.py +++ b/pgscatalog.match/src/pgscatalog/match/lib/variantframe.py @@ -4,10 +4,10 @@ import polars as pl -from pgscatalog.core import TargetVariants +from pgscatalog.match.lib.targetvariants import TargetVariants -from ._arrow import loose -from ._match.preprocess import filter_target, annotate_multiallelic +from pgscatalog.match.lib._arrow import loose +from pgscatalog.match.lib._match.preprocess import filter_target, annotate_multiallelic logger = logging.getLogger(__name__) diff --git a/pgscatalog.match/tests/test_match_cli.py b/pgscatalog.match/tests/test_match_cli.py index d37e4fa..f039e99 100644 --- a/pgscatalog.match/tests/test_match_cli.py +++ b/pgscatalog.match/tests/test_match_cli.py @@ -6,7 +6,7 @@ from unittest.mock import patch import pytest -from pgscatalog.core import ZeroMatchesError +from pgscatalog.core.lib.pgsexceptions import ZeroMatchesError from pgscatalog.match.cli.match_cli import run_match diff --git a/pgscatalog.match/tests/test_merge_cli.py b/pgscatalog.match/tests/test_merge_cli.py index 1c7bd0c..3afbf3f 100644 --- a/pgscatalog.match/tests/test_merge_cli.py +++ b/pgscatalog.match/tests/test_merge_cli.py @@ -4,7 +4,7 @@ from glob import glob -from pgscatalog.core import ZeroMatchesError +from pgscatalog.core.lib.pgsexceptions import ZeroMatchesError from pgscatalog.match.cli.merge_cli import run_merge