diff --git a/.gitignore b/.gitignore
index a595dcfe..96198474 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,5 @@ assets/report/renv/
assets/report/report.Rproj
.Rprofile
tests/.venv/
+docs/.venv/
+docs/_build
\ No newline at end of file
diff --git a/CITATIONS.md b/CITATIONS.md
index 128445bc..7cba0759 100644
--- a/CITATIONS.md
+++ b/CITATIONS.md
@@ -33,9 +33,6 @@
* [Bioconda](https://pubmed.ncbi.nlm.nih.gov/29967506/)
> Grüning B, Dale R, Sjödin A, Chapman BA, Rowe J, Tomkins-Tinch CH, Valieris R, Köster J; Bioconda Team. Bioconda: sustainable and comprehensive software distribution for the life sciences. Nat Methods. 2018 Jul;15(7):475-476. doi: 10.1038/s41592-018-0046-7. PubMed PMID: 29967506.
-* [BioContainers](https://pubmed.ncbi.nlm.nih.gov/28379341/)
- > da Veiga Leprevost F, Grüning B, Aflitos SA, Röst HL, Uszkoreit J, Barsnes H, Vaudel M, Moreno P, Gatto L, Weber J, Bai M, Jimenez RC, Sachsenberg T, Pfeuffer J, Alvarez RV, Griss J, Nesvizhskii AI, Perez-Riverol Y. BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics. 2017 Aug 15;33(16):2580-2582. doi: 10.1093/bioinformatics/btx192. PubMed PMID: 28379341; PubMed Central PMCID: PMC5870671.
-
* [Docker](https://dl.acm.org/doi/10.5555/2600239.2600241)
* [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/)
diff --git a/assets/report/report.qmd b/assets/report/report.qmd
index b608ae49..5c7a69ac 100644
--- a/assets/report/report.qmd
+++ b/assets/report/report.qmd
@@ -443,7 +443,7 @@ uscores_plot <- uscores[1:min(length(uscores), 6)] # plot max 6 PGS
# Plot multiple adjustment methods at once per PGS
for(current_pgs in uscores_plot){
- long_scores <- scores %>% select(!percentile_MostSimilarPop) %>% filter(PGS == current_pgs) %>% gather(Method, score, -sampleset, -IID, -PGS)
+ long_scores <- scores %>% select(!percentile_MostSimilarPop) %>% filter(PGS == current_pgs) %>% gather(Method, score, -sampleset, -FID, -IID, -PGS)
long_scores %>%
ggplot(aes(x = score, fill = sampleset)) +
geom_density(alpha = 0.3) +
diff --git a/conf/modules.config b/conf/modules.config
index 8c341266..b422c3fe 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -38,12 +38,12 @@ process {
ext.conda = "$projectDir/environments/pgscatalog_utils/environment.yml"
ext.docker = 'ghcr.io/pgscatalog/pygscatalog'
ext.singularity = 'oras://ghcr.io/pgscatalog/pygscatalog'
- ext.docker_version = ':pgscatalog-utils-1.1.2'
- ext.singularity_version = ':pgscatalog-utils-1.1.2-singularity'
+ ext.docker_version = ':pgscatalog-utils-1.2.0'
+ ext.singularity_version = ':pgscatalog-utils-1.2.0-singularity'
}
withLabel: plink2 {
- ext.conda = "bioconda::plink2==2.00a5.10"
+ ext.conda = "$projectDir/environments/plink2/environment.yml"
ext.docker = 'ghcr.io/pgscatalog/plink2'
ext.singularity = 'oras://ghcr.io/pgscatalog/plink2'
ext.docker_version = ':2.00a5.10'
@@ -52,22 +52,22 @@ process {
withLabel: zstd {
ext.conda = "$projectDir/environments/zstd/environment.yml"
- ext.singularity = 'oras://dockerhub.ebi.ac.uk/gdp-public/pgsc_calc/singularity/zstd'
- ext.singularity_version = ':amd64-1.4.8'
- ext.docker = 'dockerhub.ebi.ac.uk/gdp-public/pgsc_calc/zstd'
- ext.docker_version = ':1.4.8'
+ ext.singularity = 'oras://ghcr.io/pgscatalog/zstd'
+ ext.singularity_version = ':2-beta-singularity'
+ ext.docker = 'ghcr.io/pgscatalog/zstd'
+ ext.docker_version = ':2-beta'
}
withLabel: report {
ext.conda = "$projectDir/environments/report/environment.yml"
- ext.singularity = 'oras://dockerhub.ebi.ac.uk/gdp-public/pgsc_calc/singularity/report'
- ext.singularity_version = ':2.00a5'
- ext.docker = 'dockerhub.ebi.ac.uk/gdp-public/pgsc_calc/report'
- ext.docker_version = ':2.00a5'
+ ext.singularity = 'oras://ghcr.io/pgscatalog/report'
+ ext.singularity_version = ':2-beta-singularity'
+ ext.docker = 'ghcr.io/pgscatalog/report'
+ ext.docker_version = ':2-beta'
}
withLabel: pyyaml {
- ext.conda = "conda-forge::pyyaml==6.0"
+ ext.conda = "$projectDir/environments/pyyaml/environment.yml"
ext.singularity = 'oras://ghcr.io/pgscatalog/pyyaml'
ext.singularity_version = ':6.0-singularity'
ext.docker = 'ghcr.io/pgscatalog/pyyaml'
@@ -77,9 +77,9 @@ process {
withLabel: fraposa {
ext.conda = "$projectDir/environments/fraposa/environment.yml"
ext.singularity = 'oras://ghcr.io/pgscatalog/fraposa_pgsc'
- ext.singularity_version = ':v0.1.0-singularity'
+ ext.singularity_version = ':v1.0.0-singularity'
ext.docker = 'ghcr.io/pgscatalog/fraposa_pgsc'
- ext.docker_version = ':v0.1.1'
+ ext.docker_version = ':v1.0.0'
}
// output configuration
diff --git a/docs/_templates/globaltoc.html b/docs/_templates/globaltoc.html
index f54562ec..b7cad074 100644
--- a/docs/_templates/globaltoc.html
+++ b/docs/_templates/globaltoc.html
@@ -22,7 +22,7 @@
Contents
About the project
@@ -35,7 +35,7 @@ Useful links
Issue tracker
Discussion board
- pgscatalog_utils Github
+ pgscatalog-utils GitHub
diff --git a/docs/changelog.rst b/docs/changelog.rst
deleted file mode 100644
index 676c6a06..00000000
--- a/docs/changelog.rst
+++ /dev/null
@@ -1,332 +0,0 @@
-:orphan:
-
-Changelog
----------
-
-Versions follow `semantic versioning`_ (``major.minor.patch``). Breaking changes
-will only occur in major versions with changes noted in this changelog.
-
-.. _`semantic versioning`: https://semver.org/
-
-pgsc_calc v2.0.0-beta (2024-06-19)
--------------------------------------
-
-Graduating to beta with the release of `our preprint `_ 🎉
-
-Improvements
-
-* https://github.com/PGScatalog/pygscatalog/pull/23
-
-* https://github.com/PGScatalog/pygscatalog/pull/22
-
-* https://github.com/PGScatalog/pgsc_calc/pull/311
-
-* Publish dependencies to Bioconda to improve conda profile UX
-
- * https://anaconda.org/bioconda/fraposa-pgsc
-
- * https://anaconda.org/bioconda/pgscatalog.core
-
- * https://anaconda.org/bioconda/pgscatalog.match
-
- * https://anaconda.org/bioconda/pgscatalog.calc
-
-Bug fixes
-
-* Fix for https://github.com/PGScatalog/pygscatalog/issues/21
-
-* Closes https://github.com/PGScatalog/pgsc_calc/pull/301
-
-* Specify modules explicitly to fix https://github.com/PGScatalog/pgsc_calc/pull/312
-
-* Fix bim input to `pgscatalog-aggregate` https://github.com/PGScatalog/pgsc_calc/pull/319
-
-
-pgsc_calc v2.0.0-alpha.6 (2024-05-24)
--------------------------------------
-
-Please note the minimum required nextflow version has been updated to v23.10.0, released in October 2023.
-
-Improvements
-
-* Migrate our custom python tools to new https://github.com/PGScatalog/pygscatalog packages
-
- * Reference / target intersection now considers allelic frequency and variant missingness to determine PCA eligibility
-
- * Downloads from PGS Catalog should be faster (async)
-
- * Package CLI and libraries `are now documented `_
-
-* Update plink version to alpha 5.10 final
-
-* Add docs describing cloud execution
-
-* Add correlation test comparing calculated scores against known good scores
-
-* When matching variants, matching logs are now written before scorefiles to improve debugging UX
-
-* Improvements to PCA quality (ensuring low missingness and suitable MAF for PCA-eligble variants in target samples).
-
- * This could allow us to implement MAF/missingness filters for scoring file variants in the future.
-
-Bug fixes
-
-* Fix ancestry adjustment with VCFs
-* Fix support for scoring files that only have one effect type column
-* Fix adjusting PGS with zero variance (skip them)
-* Check for reserved characters in sampleset names
-
-pgsc_calc v2.0.0-alpha.5 (2024-03-19)
--------------------------------------
-
-Improvements:
-
-* Automatically mount directories inside singularity containers without setting any configuration
-* Improve permanent caching of ancestry processes with --genotypes_cache parameter
-* resync with nf-core framework
-* Refactor combine_scorefiles
-
-Bug fixes:
-
-* Fix semantic storeDir definitions causing problems cloud execution (google batch)
-* Fix missing DENOM values with multiple custom scoring files (score calculation not affected)
-* Fix liftover failing silently with custom scoring files (thanks Brooke!)
-
-Misc:
-
-* Move aggregation step out of report
-
-pgsc_calc v2.0.0-alpha.4 (2023-12-05)
--------------------------------------
-
-Improvements:
-
-* Give a more helpful error message when there's no valid variant matches found
-
-Bug fixes:
-
-* Fix retrying downloads from PGS Catalog
-* Fix numeric sample identifiers breaking ancestry analysis
-* Check for chr prefix in samplesheets and error
-
-pgsc_calc v2.0.0-alpha.3 (2023-10-02)
--------------------------------------
-
-Improvements:
-
-* Automatically retry scoring with more RAM on larger datasets
-* Describe scoring precision in docs
-* Change handling of VCFs to reduce errors when recoding
-* Internal changes to improve support for custom reference panels
-
-Bug fixes:
-
-* Fix VCF input to ancestry projection subworkflow (thanks `@frahimov`_ and `@AWS-crafter`_ for patiently debugging)
-* Fix scoring options when reading allelic frequencies from a reference panel (thanks `@raimondsre`_ for reporting the changes from v1.3.2 -> 2.0.0-alpha)
-* Fix conda profile action
-
-.. _`@frahimov`: https://github.com/PGScatalog/pgsc_calc/issues/172
-.. _`@AWS-crafter`: https://github.com/PGScatalog/pgsc_calc/issues/155
-.. _`@raimondsre`: https://github.com/PGScatalog/pgsc_calc/pull/139#issuecomment-1736313211
-
-pgsc_calc v2.0.0-alpha.1 (2023-08-11)
--------------------------------------
-
-This patch fixes a bug when running the workflow directly from github with the
-test profile (i.e. without cloning first). Thanks to `@staedlern`_ for reporting the
-problem.
-
-.. _`@staedlern`: https://github.com/PGScatalog/pgsc_calc/issues/151
-
-pgsc_calc v2.0.0-alpha (2023-08-08)
------------------------------------
-
-This major release features breaking changes to samplesheet structure to provide
-more flexible support for extra genomic file types in the future. Two major new
-features were implemented in this release:
-
-- Genetic ancestry group similarity is calculated to a population reference panel
- (default: 1000 Genomes) when the ``--run_ancestry`` flag is supplied. This runs
- using PCA and projection implemented in the ``fraposa_pgsc (v0.1.0)`` package.
-- Calculated PGS can be adjusted for genetic ancestry using empirical PGS distributions
- from the most similar reference panel population or continuous PCA-based regressions.
-
-These new features are optional and don't run in the default workflow. Other features
-included in the release are:
-
-- Speed optimizations for PGS scoring (skipping allele frequency calculation)
-
-pgsc_calc v1.3.2 (2023-01-27)
------------------------------
-
-This patch fixes a bug that made some PGS Catalog scoring files incompatible
-with the pipeline. Effect weights were sometimes set to utf-8 strings instead of
-floating point numbers, which caused an assertion error. Thanks to `@j0n-a`_ for
-reporting the problem.
-
-.. _`@j0n-a`: https://github.com/PGScatalog/pgsc_calc/issues/79
-
-pgsc_calc v1.3.1 (2023-01-24)
------------------------------
-
-This patch fixes a bug that breaks the workflow if all variants in one or more
-PGS scoring files match perfectly with the target genomes. Thanks to
-`@lemieuxl`_ for reporting the problem!
-
-.. _`@lemieuxl`: https://github.com/PGScatalog/pgsc_calc/issues/75
-
-pgsc_calc v1.3.0 (2022-11-21)
------------------------------
-
-This release is focused on improving scalability.
-
-Features
-~~~~~~~~
-
-- Variant matching is made more efficient using a split - apply - combine
- approach when the data is split across chromosomes. This supports parallel PGS
- calculation for the largest traits (e.g. cancer, 418 PGS [avg 261,000
- variants/score) ) in the PGS Catalog on big datasets such as UK Biobank.
-
-- Better support for running in offline environments:
-
- - Internet access is only required to download scores by ID. Scores can be
- pre-downloaded using the utils package
- (https://pypi.org/project/pgscatalog-utils/)
-
- - Scoring file metadata is read from headers and displayed in the report
- (removed API calls during report generation)
-
-- Implemented flag (--efo_direct) to return only PGS tagged with exact EFO term
- (e.g. no PGS for child/descendant terms in the ontology)
-
-pgsc_calc v1.2.0 (2022-10-11)
------------------------------
-
-This release is focused on improving memory and storage usage.
-
-Features
-~~~~~~~~
-
-- Allow genotype dosages to be imported from VCF to be specified in ``vcf_genotype_field``
- of samplesheet_ (default: GT / hard calls)
-
-- Makes use of `durable caching`_ when relabelling and recoding target genomes (``--genotypes_cache``)
-
-- Improvements to use less storage space:
-
- - All intermediate files are now compressed by default
-
- - Add parameter to support zstd compressed input files
-
-- Improved memory usage when matching variants (``pgscatalog_utils=v0.1.2``
- https://github.com/PGScatalog/pgscatalog_utils)
-
-- Revised interface to select scores from the PGS Catalog using flags:
- ``--trait_efo`` (EFO ID / traits), ``--pgp_id`` (PGP ID / publications), ``--pgs_id`` (PGS ID, individual scores).
-
-.. _samplesheet: https://pgsc-calc.readthedocs.io/en/dev/reference/input.html
-.. _durable caching: https://pgsc-calc.readthedocs.io/en/dev/reference/params.html#parameter-schema
-
-pgsc_calc v1.1.0 (2022-09-15)
------------------------------
-
-The first public release of the pgsc_calc pipeline. This release adds compatibility
-for every score published in the PGS Catalog. Each scoring file in the PGS Catalog
-has been processed to provide consistent genomic coordinates in builds GRCh37 and GRCh38.
-The pipeline has been updated to take advantage of the harmonised scoring files (see
-`PGS Catalog downloads`_ for additional details).
-
-.. _PGS Catalog downloads: https://www.pgscatalog.org/downloads/#dl_ftp_scoring_hm_pos
-
-Features
-~~~~~~~~
-
-- Many of the underlying software tools are now implemented within a ``pgscatalog_utils``
- package (``v0.1.2``, https://github.com/PGScatalog/pgscatalog_utils and
- https://pypi.org/project/pgscatalog-utils/ ). The packaging allows for independent
- testing and development of tools for downloading and working with the scoring files.
-
-- The output report has been improved to have more detailed metadata describing
- the scoring files and how well the variants match the target sampleset(s).
-
-- Improvements to variant matching:
- - More precise control of variant matching parameters is now possible, like
- ignoring strand flips
- - ``match_variants`` should now use less RAM by default:
- - A laptop with 16GB of RAM should be able to comfortably calculate scores on
- the 1000 genomes dataset
- - Fast matching mode (``--fast_match``) is available if ~32GB of RAM is
- available and you'd like to calculate scores for larger datasets
-
-- Groups of scores from the PGS Catalog can be calculated by specifying a specific
- ``--trait`` (EFO ID) or ``--publication`` (PGP ID), in addition to using individual
- scoring files ``--pgs_id`` (PGS ID).
-
-- Score validation has been integrated with the test suite
-
-- Support for M1 Macs with ``--platform`` parameter (docker executor only)
-
-
-Bug fixes
-~~~~~~~~~
-
-- Implemented a more robust prioritisation procedure if a variant has multiple
- candidate matches or duplicated IDs
-
-- Fixed processing multiple samplesets in parallel (e.g. 1000 Genomes + UK
- Biobank)
-
-- When combining multiple scoring files, all variants are now kept to reflect the
- correct denominator for % matching statistics.
-
-- When trying to correct for strand flips the matched effect allele wasn't being
- correctly complemented
-
-pgsc_calc v1.0.0 (2022-05-24)
---------------------------------
-
-This release produces scores that should be biologically meaningful. Significant
-effort has been made to validate calculate scores on different datasets. In the
-next release we'll add score validation to our test suite to make sure
-calculated scores stay valid in the future.
-
-Features
-~~~~~~~~
-
-- Add support for PLINK2 format (samplesheet structure changed)
-- Add support for allosomes (e.g. X, Y)
-- Improve PGS Catalog compatibility (e.g. missing other allele)
-- Add automatic liftover of scoring files to match target genome build
-- Performance improvements to support UK BioBank scale data (500,000 genomes)
-- Support calculation of multiple scores in parallel
-- Significantly improved test coverage (> 80%)
-- Lots of other small changes to improve correctness and handling edge cases
-
-pgsc_calc v0.1.3dev (2022-02-04)
---------------------------------
-
-Features
-~~~~~~~~
-
-- Simplified JSON input processes
-- Add first draft of documentation
-- Add JSON schemas for validating input data (mostly for web platform)
-
-pgsc_calc v0.1.2dev (2022-01-17)
---------------------------------
-
-Features
-~~~~~~~~
-
-- Add JSON input support for web platform functionality
-- Set up simple CI tests with Github actions
-
-pgsc_calc v0.1.1dev (2021-12-16)
---------------------------------
-
-Features
-~~~~~~~~
-
-- First public release
-- Support applying a single scoring file to target genomic data in GrCh37 build
diff --git a/docs/conf.py b/docs/conf.py
index 86057e0a..8e6ac019 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -22,7 +22,7 @@
project = 'Polygenic Score (PGS) Catalog Calculator'
copyright = 'Polygenic Score (PGS) Catalog team (licensed under Apache License V2)'
-# author = 'Polygenic Score (PGS) Catalog team'
+author = 'Polygenic Score (PGS) Catalog team'
# -- General configuration ---------------------------------------------------
diff --git a/docs/explanation/geneticancestry.rst b/docs/explanation/geneticancestry.rst
index fe8660f5..c1176559 100644
--- a/docs/explanation/geneticancestry.rst
+++ b/docs/explanation/geneticancestry.rst
@@ -130,7 +130,13 @@ how-to guide), and has the following steps:
for variant-level QC (SNPs in Hardy–Weinberg equilibrium [p > 1e-04] that are bi-allelic and non-ambiguous,
with low missingness [<10%], and minor allele frequency [MAF > 5%]) and sample-quality (missingness <10%).
LD-pruning is then applied to the variants and sample passing these checks (r\ :sup:`2` threshold = 0.05), excluding
- complex regions with high LD (e.g. MHC). These methods are implemented in the ``FILTER_VARIANTS`` module.
+ complex regions with high LD (e.g. MHC). These methods are implemented in the ``FILTER_VARIANTS`` module, and
+ the default settings can be changed (see :doc:`schema (Reference options) </reference/params>`).
+
+ 1. **Additional variant filters on TARGET samples**: in ``v2.0.0-beta`` we introduced the ability to filter
+ target sample variants using a maximum genotype missingness [default 10%] and/or minimum MAF [default 0%] to
+ improve PCA robustness when using imputed genotype data (see :doc:`schema (Ancestry options) </reference/params>`).
+ *Note: these parameters may need to be adjusted depending on your input data.*
2. **PCA**: the LD-pruned variants of the unrelated samples passing QC are then used to define the PCA space of the
reference panel (default: 10 PCs) using `FRAPOSA`_ (Fast and Robust Ancestry Prediction by using Online singular
diff --git a/docs/explanation/match.rst b/docs/explanation/match.rst
index a072eb65..85449b4c 100644
--- a/docs/explanation/match.rst
+++ b/docs/explanation/match.rst
@@ -37,6 +37,8 @@ When you evaluate the predictive performance of a score with low match rates it
If you reduce ``--min_overlap`` then the calculator will output scores calculated with the remaining variants, **but these scores may not be representative of the original data submitted to the PGS Catalog.**
+.. _wgs:
+
Are your target genomes imputed? Are they WGS?
----------------------------------------------
@@ -49,7 +51,7 @@ In the future we plan to improve support for WGS.
Did you set the correct genome build?
-------------------------------------
-The calculator will automatically grab scoring files in the correct genome build from the PGS Catalog. If match rates are low it may be because you have specified the wrong genome build. If you're using custom scoring files and the match rate is low it is possible that the `--liftover` command may have been omitted.
+The calculator will automatically grab scoring files in the correct genome build from the PGS Catalog. If match rates are low it may be because you have specified the wrong genome build. If you're using custom scoring files and the match rate is low it is possible that the ``--liftover`` command may have been omitted.
I'm still getting match rate errors. How do I figure out what's wrong?
----------------------------------------------------------------------
diff --git a/docs/explanation/output.rst b/docs/explanation/output.rst
index 4aff03fe..37324e12 100644
--- a/docs/explanation/output.rst
+++ b/docs/explanation/output.rst
@@ -23,6 +23,7 @@ Calculated scores are stored in a gzipped-text space-delimted text file called
seperate row (``length = n_samples*n_pgs``), and there will be at least four columns with the following headers:
- ``sampleset``: the name of the input sampleset, or ``reference`` for the panel.
+- ``FID``: the family identifier of each sample within the dataset (may be the same as IID).
- ``IID``: the identifier of each sample within the dataset.
- ``PGS``: the accession ID of the PGS being reported.
- ``SUM``: reports the weighted sum of *effect_allele* dosages multiplied by their *effect_weight*
@@ -56,6 +57,7 @@ describing the analysis of the target samples in relation to the reference panel
following headers:
- ``sampleset``: the name of the input sampleset, or ``reference`` for the panel.
+- ``FID``: the family identifier of each sample within the dataset (may be the same as IID).
- ``IID``: the identifier of each sample within the dataset.
- ``[PC1 ... PCN]``: The projection of the sample within the PCA space defined by the reference panel. There will be as
many PC columns as there are PCs calculated (default: 10).
diff --git a/docs/getting-started.rst b/docs/getting-started.rst
index c2c4caf7..5b2dcf49 100644
--- a/docs/getting-started.rst
+++ b/docs/getting-started.rst
@@ -98,7 +98,7 @@ parameter:
--pgs_id PGS001229 # one score
--pgs_id PGS001229,PGS001405 # many scores separated by , (no spaces)
-.. note:: You can also select scores associated with traits (``--trait_efo``) and
+.. note:: You can also select scores associated with traits (``--efo_id``) and
publications (``--pgp_id``)
If you would like to use a custom scoring file not published in the PGS Catalog,
diff --git a/docs/how-to/bigjob.rst b/docs/how-to/bigjob.rst
index 68380088..8940b616 100644
--- a/docs/how-to/bigjob.rst
+++ b/docs/how-to/bigjob.rst
@@ -74,43 +74,132 @@ limits.
.. warning:: You'll probably want to use ``-profile singularity`` on a HPC. The
pipeline requires Singularity v3.7 minimum.
-However, in general you will have to adjust the ``executor`` options and job resource
-allocations (e.g. ``process_low``). Here's an example for an LSF cluster:
+Here's an example configuration running about 100 scores in parallel
+on UK Biobank with a SLURM cluster:
.. code-block:: text
process {
- queue = 'short'
- clusterOptions = ''
- scratch = true
+ errorStrategy = 'retry'
+ maxRetries = 3
+ maxErrors = '-1'
+ executor = 'slurm'
+
+ withName: 'DOWNLOAD_SCOREFILES' {
+ cpus = 1
+ memory = { 1.GB * task.attempt }
+ time = { 1.hour * task.attempt }
+ }
- withLabel:process_low {
- cpus = 2
- memory = 8.GB
- time = 1.h
+ withName: 'COMBINE_SCOREFILES' {
+ cpus = 1
+ memory = { 8.GB * task.attempt }
+ time = { 2.hour * task.attempt }
}
- withLabel:process_medium {
- cpus = 8
- memory = 64.GB
- time = 4.h
+
+ withName: 'PLINK2_MAKEBED' {
+ cpus = 2
+ memory = { 8.GB * task.attempt }
+ time = { 1.hour * task.attempt }
}
- }
- executor {
- name = 'lsf'
- jobName = { "$task.hash" }
- }
+ withName: 'RELABEL_IDS' {
+ cpus = 1
+ memory = { 16.GB * task.attempt }
+ time = { 1.hour * task.attempt }
+ }
+
+ withName: 'PLINK2_ORIENT' {
+ cpus = 2
+ memory = { 8.GB * task.attempt }
+ time = { 1.hour * task.attempt }
+ }
+
+ withName: 'DUMPSOFTWAREVERSIONS' {
+ cpus = 1
+ memory = { 1.GB * task.attempt }
+ time = { 1.hour * task.attempt }
+ }
+
+ withName: 'ANCESTRY_ANALYSIS' {
+ cpus = { 1 * task.attempt }
+ memory = { 8.GB * task.attempt }
+ time = { 1.hour * task.attempt }
+ }
+
+ withName: 'SCORE_REPORT' {
+ cpus = 2
+ memory = { 8.GB * task.attempt }
+ time = { 1.hour * task.attempt }
+ }
-In SLURM, queue is equivalent to a partition. Specific cluster parameters can be
-provided by modifying ``clusterOptions``. You should change ``cpus``,
-``memory``, and ``time`` to match the amount of resources used. Assuming the
-configuration file you set up is saved as ``my_custom.config`` in your current
-working directory, you're ready to run pgsc_calc. Instead of running nextflow
-directly on the shell, save a bash script (``run_pgscalc.sh``) to a file
-instead:
+ withName: 'EXTRACT_DATABASE' {
+ cpus = 1
+ memory = { 8.GB * task.attempt }
+ time = { 1.hour * task.attempt }
+ }
+
+ withName: 'PLINK2_RELABELPVAR' {
+ cpus = 2
+ memory = { 16.GB * task.attempt }
+ time = { 2.hour * task.attempt }
+ }
+
+ withName: 'INTERSECT_VARIANTS' {
+ cpus = 2
+ memory = { 8.GB * task.attempt }
+ time = { 1.hour * task.attempt }
+ }
+
+ withName: 'MATCH_VARIANTS' {
+ cpus = 2
+ memory = { 32.GB * task.attempt }
+ time = { 6.hour * task.attempt }
+ }
+
+ withName: 'FILTER_VARIANTS' {
+ cpus = 2
+ memory = { 16.GB * task.attempt }
+ time = { 1.hour * task.attempt }
+ }
+
+ withName: 'MATCH_COMBINE' {
+ cpus = 4
+ memory = { 64.GB * task.attempt }
+ time = { 6.hour * task.attempt }
+ }
+
+ withName: 'FRAPOSA_PCA' {
+ cpus = 2
+ memory = { 8.GB * task.attempt }
+ time = { 1.hour * task.attempt }
+ }
+
+ withName: 'PLINK2_SCORE' {
+ cpus = 2
+ memory = { 8.GB * task.attempt }
+ time = { 12.hour * task.attempt }
+ }
+
+ withName: 'SCORE_AGGREGATE' {
+ cpus = 2
+ memory = { 16.GB * task.attempt }
+ time = { 4.hour * task.attempt }
+ }
+ }
+
+Assuming the configuration file you set up is saved as
+``my_custom.config`` in your current working directory, you're ready
+to run pgsc_calc. Rather than running nextflow directly in the shell,
+save the commands in a bash script (``run_pgscalc.sh``):
.. code-block:: bash
-
+
+ #SBATCH -J ukbiobank_pgs
+ #SBATCH -c 2
+ #SBATCH -t 24:00:00
+ #SBATCH --mem=4G
+
export NXF_ANSI_LOG=false
export NXF_OPTS="-Xms500M -Xmx2G"
@@ -126,20 +215,23 @@ instead:
.. note:: The name of the nextflow and singularity modules will be different in
your local environment
- .. warning:: Make sure to copy input data to fast storage, and run the pipeline
- on the same fast storage area. You might include these steps in your
- bash script. Ask your sysadmin for help if you're not sure what this
- means.
+.. warning:: Make sure to copy input data to fast storage, and run the
+ pipeline on the same fast storage area. You might include
+ these steps in your bash script. Ask your sysadmin for
+ help if you're not sure what this means.
.. code-block:: console
- $ bsub -M 2GB -q short -o output.txt < run_pgscalc.sh
-
+ $ sbatch run_pgscalc.sh
+
This will submit a nextflow driver job, which will submit additional jobs for
-each process in the workflow. The nextflow driver requires up to 4GB of RAM
-(bsub's ``-M`` parameter) and 2 CPUs to use (see a guide for `HPC users`_ here).
+each process in the workflow. The nextflow driver needs up to 4GB of RAM and 2 CPUs (see this guide for `HPC users`_).
-.. _`LSF and PBS`: https://nextflow.io/docs/latest/executor.html#slurm
.. _`HPC users`: https://www.nextflow.io/blog/2021/5_tips_for_hpc_users.html
.. _`a nextflow profile`: https://github.com/nf-core/configs
+
+Cloud deployments
+-----------------
+
+We've deployed the calculator to Google Cloud Batch but some :doc:`special configuration is required`.
diff --git a/docs/how-to/cache.rst b/docs/how-to/cache.rst
index 6cad3a0b..b4f08697 100644
--- a/docs/how-to/cache.rst
+++ b/docs/how-to/cache.rst
@@ -1,23 +1,26 @@
.. _cache:
-How do I speed up `pgsc_calc` computation times and avoid re-running code?
-==========================================================================
+How do I speed up computation times and avoid re-running code?
+==============================================================
-If you intend to run `pgsc_calc` multiple times on the same target samples (e.g.
+If you intend to run ``pgsc_calc`` multiple times on the same target samples (e.g.
on different sets of PGS, with different variant matching flags) it is worth cacheing
information on invariant steps of the pipeline:
- Genotype harmonzation (variant relabeling steps)
-- Steps of `--run_ancestry` that: match variants between the target and reference panel and
+- Steps of ``--run_ancestry`` that: match variants between the target and reference panel and
generate PCA loadings that can be used to adjust the PGS for ancestry.
-To do this you must specify a directory that can store these information across runs using the
-`--genotypes_cache` flag to the nextflow command (also see :ref:`param ref`). Future runs of the
-pipeline that use the same cache directory should then skip these steps and proceed to run only the
-steps needed to calculate new PGS. This is slightly different than using the `-resume command in
-nextflow `_ which mainly checks the
-`work` directory and is more often used for restarting the pipeline when a specific step has failed
-(e.g. for exceeding memory limits).
+To do this you must specify a directory that can store this
+information across runs using the ``--genotypes_cache`` flag to the
+nextflow command (also see :ref:`param ref`). Future runs of the
+pipeline that use the same cache directory should then skip these
+steps and proceed to run only the steps needed to calculate new PGS.
+This is slightly different than using the `-resume command in nextflow
+<https://www.nextflow.io/docs/latest/cache-and-resume.html>`_
+which mainly checks the ``work`` directory and is more often used for
+restarting the pipeline when a specific step has failed (e.g. for
+exceeding memory limits).
.. warning:: Always use a new cache directory for different samplesets, as redundant names may clash across runs.
diff --git a/docs/how-to/calculate_custom.rst b/docs/how-to/calculate_custom.rst
index 77333dee..5d7f17b4 100644
--- a/docs/how-to/calculate_custom.rst
+++ b/docs/how-to/calculate_custom.rst
@@ -26,7 +26,7 @@ minimal header in the following format:
Header::
#pgs_name=metaGRS_CAD
- #pgs_name=metaGRS_CAD
+ #pgs_id=metaGRS_CAD
#trait_reported=Coronary artery disease
#genome_build=GRCh37
diff --git a/docs/how-to/calculate_pgscatalog.rst b/docs/how-to/calculate_pgscatalog.rst
index c04d7877..dc2cd55f 100644
--- a/docs/how-to/calculate_pgscatalog.rst
+++ b/docs/how-to/calculate_pgscatalog.rst
@@ -46,12 +46,12 @@ Traits
~~~~~~
If you would like to calculate every polygenic score in the Catalog for a
-`trait`_, like `coronary artery disease`_, then you can use the ``--trait_efo``
+`trait`_, like `coronary artery disease`_, then you can use the ``--efo_id``
parameter:
.. code-block:: console
- --trait_efo EFO_0001645
+ --efo_id EFO_0001645
Multiple traits can be set by using a comma separated list.
@@ -85,7 +85,7 @@ Multiple traits can be set by using a comma separated list.
-profile \
--input samplesheet.csv \
--pgs_id PGS001229 \
- --trait_efo EFO_0001645 \
+ --efo_id EFO_0001645 \
--pgp_id PGP000001
.. note:: For more details about calculating multiple scores, see :ref:`multiple`
diff --git a/docs/how-to/multiple.rst b/docs/how-to/multiple.rst
index e8f44889..84d98f46 100644
--- a/docs/how-to/multiple.rst
+++ b/docs/how-to/multiple.rst
@@ -133,7 +133,7 @@ Congratulations, you've now calculated multiple scores in parallel!
combine scores in the PGS Catalog with your own custom scores
After the workflow executes successfully, the calculated scores and a summary
-report should be available in the ``results/make/`` directory by default. If
+report should be available in the ``results/`` directory by default. If
you're interested in more information, see :ref:`interpret`.
If the workflow didn't execute successfully, have a look at the
diff --git a/docs/how-to/offline.rst b/docs/how-to/offline.rst
index a77bf118..ca9e8da4 100644
--- a/docs/how-to/offline.rst
+++ b/docs/how-to/offline.rst
@@ -127,8 +127,12 @@ panel too. See :ref:`norm`.
Download scoring files
----------------------
-It's best to manually download scoring files from the PGS Catalog in the correct
-genome build. Using PGS001229 as an example:
+.. tip:: Use our CLI application ``pgscatalog-download`` to `download multiple scoring`_ files in parallel and in the correct genome build
+
+.. _download multiple scoring: https://pygscatalog.readthedocs.io/en/latest/how-to/guides/download.html
+
+You'll need to preload scoring files in the correct genome build.
+Using PGS001229 as an example:
https://ftp.ebi.ac.uk/pub/databases/spot/pgs/scores/PGS001229/ScoringFiles/
diff --git a/docs/how-to/prepare.rst b/docs/how-to/prepare.rst
index d74427bc..ec174c4f 100644
--- a/docs/how-to/prepare.rst
+++ b/docs/how-to/prepare.rst
@@ -52,6 +52,8 @@ VCF from WGS
See https://github.com/PGScatalog/pgsc_calc/discussions/123 for discussion about tools
to convert the VCF files into ones suitable for calculating PGS.
+If you input WGS data to the calculator without following the steps above then you will probably encounter match rate errors. For more information, see: :ref:`wgs`
+
``plink`` binary fileset (bfile)
--------------------------------
diff --git a/docs/how-to/samplesheet.rst b/docs/how-to/samplesheet.rst
index 49cb648f..f94a84e6 100644
--- a/docs/how-to/samplesheet.rst
+++ b/docs/how-to/samplesheet.rst
@@ -27,7 +27,7 @@ download here <../../assets/examples/samplesheet.csv>`.
There are four mandatory columns:
-- **sampleset**: A text string (no spaces, or reserved characters [ '.' or '_' ]) referring
+- **sampleset**: A text string (no spaces, or reserved characters [ ``.`` or ``_`` ]) referring
to the name of a :term:`target dataset` of genotyping data containing at least one
sample/individual (however cohort datasets will often contain many individuals with
combined genotyped/imputed data). Data from a sampleset may be input as a single file,
@@ -61,12 +61,11 @@ There are four mandatory columns:
Notes
~~~~~
-.. note:: Multiple samplesheet rows are typically only needed if:
-
- - The target genomes are split to have a one file per chromosome
- - You're working with multiple cohorts simultaneously
+.. danger:: Always include every target genome chromosome in your samplesheet unless you're certain that missing chromosomes aren't in the scoring files
+
+.. note:: Multiple samplesheet rows are typically only needed if the target genomes are split to have one file per chromosome
-.. danger:: All samplesets have to be in the same genome build (either GRCh37 or
+.. danger:: All target genome files have to be in the same genome build (either GRCh37 or
GRCh38) which is specified using the ``--target_build [GRCh3#]``
command. All scoring files are downloaded or mapped to match the specified
genome build, no liftover/re-mapping of the genotyping data is performed
@@ -90,10 +89,7 @@ There is one optional column:
imputation tools (Michigan or TopMed Imputation Servers) that output dosages for the
ALT allele(s): to extract these data users should enter ``DS`` in this column.
-An example of a samplesheet with two VCF datasets where you'd like to import
-different genotypes from each is below:
-
-.. list-table:: Example samplesheet with genotype field set
+.. list-table:: Example samplesheet with genotype field set to hard-calls (default)
:header-rows: 1
* - sampleset
@@ -106,6 +102,15 @@ different genotypes from each is below:
- 22
- vcf
- ``GT``
+
+.. list-table:: Example samplesheet with genotype field set to dosage
+ :header-rows: 1
+
+ * - sampleset
+ - path_prefix
+ - chrom
+ - format
+ - vcf_genotype_field
* - cineca_imputed
- path/to/vcf_imputed
- 22
diff --git a/docs/index.rst b/docs/index.rst
index dca0cb76..bec94718 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -54,7 +54,7 @@ The workflow relies on open source scientific software, including:
A full description of included software is described in :ref:`containers`.
.. _PLINK 2: https://www.cog-genomics.org/plink/2.0/
-.. _PGS Catalog Utilities: https://github.com/PGScatalog/pgscatalog_utils
+.. _PGS Catalog Utilities: https://github.com/PGScatalog/pygscatalog
.. _FRAPOSA: https://github.com/PGScatalog/fraposa_pgsc
@@ -120,7 +120,10 @@ Documentation
Changelog
---------
-The :doc:`Changelog page` describes fixes and enhancements for each version.
+The `Changelog page`_ describes fixes and enhancements for each version.
+
+.. _`Changelog page`: https://github.com/PGScatalog/pgsc_calc/releases
+
Features under development
--------------------------
diff --git a/environments/fraposa/environment.yml b/environments/fraposa/environment.yml
index b5c27de0..b69cb02a 100644
--- a/environments/fraposa/environment.yml
+++ b/environments/fraposa/environment.yml
@@ -2,5 +2,6 @@ name: fraposa-pgsc
channels:
- conda-forge
- bioconda
+ - nodefaults
dependencies:
-- fraposa-pgsc=0.1.1
\ No newline at end of file
+ - fraposa-pgsc=1.0.0
\ No newline at end of file
diff --git a/environments/pgscatalog_utils/environment.yml b/environments/pgscatalog_utils/environment.yml
index fac6ef15..86c3f114 100644
--- a/environments/pgscatalog_utils/environment.yml
+++ b/environments/pgscatalog_utils/environment.yml
@@ -2,5 +2,6 @@ name: pgscatalog-utils
channels:
- conda-forge
- bioconda
+ - nodefaults
dependencies:
-- pgscatalog-utils=1.1.2
+ - pgscatalog-utils=1.2.0
diff --git a/environments/plink2/environment.yml b/environments/plink2/environment.yml
new file mode 100644
index 00000000..1f3bc091
--- /dev/null
+++ b/environments/plink2/environment.yml
@@ -0,0 +1,7 @@
+name: plink2
+channels:
+ - conda-forge
+ - bioconda
+ - nodefaults
+dependencies:
+ - plink2==2.00a5.10
\ No newline at end of file
diff --git a/environments/pyyaml/environment.yml b/environments/pyyaml/environment.yml
index 15cb9b73..a7573aff 100644
--- a/environments/pyyaml/environment.yml
+++ b/environments/pyyaml/environment.yml
@@ -2,6 +2,6 @@ name: pyyaml
channels:
- conda-forge
- bioconda
- - defaults
+ - nodefaults
dependencies:
-- pyyaml=6.0.1
+ - pyyaml=6.0.1
diff --git a/environments/report/environment.yml b/environments/report/environment.yml
index 9b9bc4b5..43adc80f 100644
--- a/environments/report/environment.yml
+++ b/environments/report/environment.yml
@@ -2,6 +2,7 @@ name: report
channels:
- conda-forge
- bioconda
+ - nodefaults
dependencies:
- r-jsonlite
- r-dplyr
diff --git a/environments/zstd/environment.yml b/environments/zstd/environment.yml
index 39242d54..856cc981 100644
--- a/environments/zstd/environment.yml
+++ b/environments/zstd/environment.yml
@@ -2,5 +2,6 @@ name: zstd
channels:
- conda-forge
- bioconda
+ - nodefaults
dependencies:
-- zstd=1.4.8
+ - zstd=1.4.8
diff --git a/lib/WorkflowPgscCalc.groovy b/lib/WorkflowPgscCalc.groovy
index 60c4e9fe..d95c68fb 100755
--- a/lib/WorkflowPgscCalc.groovy
+++ b/lib/WorkflowPgscCalc.groovy
@@ -11,8 +11,8 @@ class WorkflowPgscCalc {
// Check and validate parameters
//
public static void initialise(params, log) {
- if (![params.scorefile, params.pgs_id, params.trait_efo, params.pgp_id].any()) {
- Nextflow.error " ERROR: You didn't set any scores to use! Please set --scorefile, --pgs_id, --trait_efo, or --pgp_id"
+ if (![params.scorefile, params.pgs_id, params.trait_efo, params.efo_id, params.pgp_id].any()) {
+ Nextflow.error " ERROR: You didn't set any scores to use! Please set --scorefile, --pgs_id, --efo_id, or --pgp_id"
}
if (!params.target_build) {
diff --git a/modules/local/ancestry/intersect_variants.nf b/modules/local/ancestry/intersect_variants.nf
index efcad0b6..e5c2efe3 100644
--- a/modules/local/ancestry/intersect_variants.nf
+++ b/modules/local/ancestry/intersect_variants.nf
@@ -33,8 +33,8 @@ process INTERSECT_VARIANTS {
pgscatalog-intersect --ref $ref_variants \
--target $variants \
--chrom $meta.chrom \
- --maf_target 0.1 \
- --geno_miss 0.1 \
+ --maf_target $params.pca_maf_target \
+ --geno_miss $params.pca_geno_miss_target \
--outdir . \
-v
diff --git a/modules/local/ancestry/oadp/fraposa_project.nf b/modules/local/ancestry/oadp/fraposa_project.nf
index acc831f8..0154c400 100644
--- a/modules/local/ancestry/oadp/fraposa_project.nf
+++ b/modules/local/ancestry/oadp/fraposa_project.nf
@@ -33,7 +33,7 @@ process FRAPOSA_PROJECT {
--method $params.projection_method \
--dim_ref 10 \
--stu_filepref ${target_geno.baseName} \
- --stu_filt_iid <(cut -f1 $split_fam) \
+ --stu_filt_iid $split_fam \
--out ${target_geno.baseName}_${split_fam}
cat <<-END_VERSIONS > versions.yml
diff --git a/modules/local/ancestry/oadp/plink2_makebed.nf b/modules/local/ancestry/oadp/plink2_makebed.nf
index 118b7716..67f8a3f1 100644
--- a/modules/local/ancestry/oadp/plink2_makebed.nf
+++ b/modules/local/ancestry/oadp/plink2_makebed.nf
@@ -52,7 +52,7 @@ process PLINK2_MAKEBED {
if [ $meta.id != 'reference' ]
then
- split -l 50000 <(grep -v '#' $pheno) ${split_output}
+ split -l 50000 ${output}.fam ${split_output}
fi
cat <<-END_VERSIONS > versions.yml
diff --git a/modules/local/plink2_score.nf b/modules/local/plink2_score.nf
index 8842bede..6e250f2e 100644
--- a/modules/local/plink2_score.nf
+++ b/modules/local/plink2_score.nf
@@ -45,7 +45,7 @@ process PLINK2_SCORE {
def no_imputation = ((ref_afreq.name == 'NO_FILE') && (meta.n_samples.toInteger() < 50)) ? "no-mean-imputation" : ""
def error_on_freq_calc = (no_imputation == "no-mean-imputation") ? "--error-on-freq-calc" : ""
- def cols = 'header-read cols=+scoresums,+denom,-fid'
+ def cols = 'header-read cols=+scoresums,+denom,+fid'
def recessive = (scoremeta.effect_type == 'recessive') ? ' recessive ' : ''
def dominant = (scoremeta.effect_type == 'dominant') ? ' dominant ' : ''
args2 = [args2, cols, 'list-variants', no_imputation, recessive, dominant, error_on_freq_calc].join(' ')
diff --git a/nextflow.config b/nextflow.config
index 480ab095..ca1b7431 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -14,6 +14,7 @@ params {
scorefile = null
pgs_id = null
trait_efo = null
+ efo_id = null // synonym for trait_efo, which is deprecated
pgp_id = null
efo_direct = false
@@ -43,6 +44,8 @@ params {
n_popcomp = 5
normalization_method = "empirical mean mean+var"
n_normalization = 4
+ pca_maf_target = 0
+ pca_geno_miss_target = 0.1
// compatibility params
liftover = false
@@ -256,7 +259,7 @@ manifest {
description = 'The Polygenic Score Catalog Calculator is a nextflow pipeline for polygenic score calculation'
mainScript = 'main.nf'
nextflowVersion = '>=23.10.0'
- version = '2.0.0-beta.1'
+ version = '2.0.0-beta.2'
}
// Load modules.config for DSL2 module specific options
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 6caed387..14686e17 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -1,33 +1,49 @@
{
- "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "$schema": "http://json-schema.org/draft-07/schema",
"$id": "https://raw.githubusercontent.com/pgscatalog/pgsc_calc/master/nextflow_schema.json",
"title": "pgscatalog/pgsc_calc pipeline parameters",
- "description": "This pipeline applies scoring files from the PGS Catalog to target set(s) of genotyped samples",
+ "description": "The Polygenic Score Catalog Calculator is a nextflow pipeline for polygenic score calculation",
"type": "object",
- "defs": {
+ "definitions": {
"input_output_options": {
"title": "Input/output options",
"type": "object",
"fa_icon": "fas fa-terminal",
"description": "Define where the pipeline should find input data and save output data.",
+ "required": ["input", "outdir"],
"properties": {
"input": {
"type": "string",
- "description": "Path to input samplesheet",
- "format": "file-path"
+ "format": "file-path",
+ "exists": true,
+ "pattern": "^\\S+\\.(csv|json)$",
+ "description": "Path to comma-separated file containing information about the samples in the experiment.",
+ "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with four mandatory columns (plus one optional column), and a header row.",
+ "fa_icon": "fas fa-file-csv"
+ },
+ "outdir": {
+ "type": "string",
+ "format": "directory-path",
+ "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.",
+ "fa_icon": "fas fa-folder-open",
+ "default": "results"
+ },
+ "email": {
+ "type": "string",
+ "description": "Email address for completion summary.",
+ "fa_icon": "fas fa-envelope",
+ "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.",
+ "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$"
},
"format": {
"type": "string",
"default": "csv",
- "fa_icon": "fas fa-cog",
- "description": "Format of input samplesheet",
- "enum": ["csv", "json"]
+ "enum": ["csv", "json"],
+ "description": "What format is the samplesheet in? (csv/json)"
},
"scorefile": {
"type": "string",
- "description": "Path to a scoring file in PGS Catalog format. Multiple scorefiles can be specified using wildcards (e.g., ``--scorefile \"path/to/scores/*.txt\"``)",
- "fa_icon": "fas fa-file-alt",
- "format": "file-path"
+ "description": "Path to a scoring file in PGS Catalog format. Multiple scorefiles can be specified using wildcards (e.g., ``--scorefile \\\"path/to/scores/*.txt\\\"``)"
},
"pgs_id": {
"type": "string",
@@ -39,10 +55,14 @@
"description": "A comma separated list of PGS Catalog publications, e.g. PGP000001",
"pattern": "PGP[0-9]{6}"
},
- "trait_efo": {
+ "efo_id": {
"type": "string",
"description": "A comma separated list of PGS Catalog EFO traits, e.g. EFO_0004214"
},
+ "trait_efo": {
+ "type": "string",
+ "description": "DEPRECATED: A comma separated list of PGS Catalog EFO traits, e.g. EFO_0004214"
+ },
"efo_direct": {
"type": "boolean",
"description": "Return only PGS tagged with exact EFO term (e.g. no PGS for child/descendant terms in the ontology)"
@@ -55,35 +75,68 @@
"type": "string",
"description": "Path to a directory that can store relabelled genotypes (and the reference panel intersections and PCA with --run_ancestry) to speed up new PGS calculations on previously harmonized samples",
"format": "directory-path"
- },
- "outdir": {
+ }
+ }
+ },
+ "compatibility_options": {
+ "title": "Compatibility options",
+ "type": "object",
+ "description": "Define parameters that control how scoring files and target genomes are made compatible with each other",
+ "default": "",
+ "properties": {
+ "target_build": {
"type": "string",
- "description": "Path to the output directory where the results will be saved.",
- "fa_icon": "fas fa-folder-open",
- "format": "directory-path",
- "default": "results"
+ "description": "Genome build of target genomes",
+ "enum": ["GRCh37", "GRCh38"]
},
- "email": {
- "type": "string",
- "description": "Email address for completion summary.",
- "fa_icon": "fas fa-envelope",
- "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.",
- "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$"
+ "liftover": {
+ "type": "boolean",
+ "description": "Lift scoring files to match your target genomes. Requires build information in the header of the scoring files."
+ },
+ "min_lift": {
+ "type": "number",
+ "default": 0.95,
+ "description": "Minimum proportion of variants required to successfully remap a scoring file to a different genome build.",
+ "minimum": 0,
+ "maximum": 1
}
},
- "required": ["input", "format"]
+ "required": ["target_build"]
+ },
+ "matching_options": {
+ "title": "Matching options",
+ "type": "object",
+ "description": "Define how variants are matched across scoring files and target genomes.",
+ "default": "",
+ "properties": {
+ "keep_multiallelic": {
+ "type": "boolean",
+ "description": "Allow matches of scoring file variants to multiallelic variants in the target dataset"
+ },
+ "keep_ambiguous": {
+ "type": "boolean",
+ "description": "Keep matches of scoring file variants to strand ambiguous variants (e.g. A/T and C/G SNPs) in the target dataset. This assumes the scoring file and target dataset report variants on the same strand."
+ },
+ "min_overlap": {
+ "type": "number",
+ "default": 0.75,
+ "minimum": 0,
+ "maximum": 1,
+ "description": "Minimum proportion of variants present in both the score file and input target genomic data"
+ }
+ }
},
- "ancestry_options": {
- "title": "Ancestry options",
+ "genetic_ancestry_options": {
+ "title": "Genetic ancestry options",
"type": "object",
- "description": "",
+ "description": "Parameters used to control genetic ancestry similarity analysis on TARGET samples and variants included in PCA",
"default": "",
"properties": {
"projection_method": {
"type": "string",
"default": "oadp",
- "enum": ["oadp", "sp", "adp"],
- "description": "The method for PCA prediction. oadp: most accurate. adp: accurate but slow. sp: fast but inaccurate."
+ "description": "The method for PCA prediction. oadp: most accurate. adp: accurate but slow. sp: fast but inaccurate.",
+ "enum": ["oadp", "sp", "adp"]
},
"ancestry_method": {
"type": "string",
@@ -104,8 +157,8 @@
"normalization_method": {
"type": "string",
"default": "empirical mean mean+var",
- "description": "Method used for normalisation of genetic ancestry",
- "enum": ["empirical", "mean", "mean+var", "empirical mean mean+var"]
+ "enum": ["empirical", "mean", "mean+var", "empirical mean mean+var"],
+ "description": "Method used for normalisation of genetic ancestry"
},
"n_normalization": {
"type": "integer",
@@ -116,6 +169,20 @@
"type": "boolean",
"default": true,
"description": "Load allelic frequencies from reference panel when scoring target genomes"
+ },
+ "pca_maf_target": {
+ "type": "number",
+ "default": 0,
+ "description": "Minimum MAF threshold in TARGET samples for variants to be included in the PCA.",
+ "minimum": 0,
+ "maximum": 1
+ },
+ "pca_geno_miss_target": {
+ "type": "number",
+ "default": 0.1,
+ "description": "Maximum genotype missingness threshold in TARGET samples for variants to be included in the PCA.",
+ "minimum": 0,
+ "maximum": 1
}
},
"required": [
@@ -125,23 +192,26 @@
"n_popcomp",
"normalization_method",
"n_normalization",
- "load_afreq"
+ "load_afreq",
+ "pca_maf_target",
+ "pca_geno_miss_target"
]
},
"reference_options": {
"title": "Reference options",
"type": "object",
- "description": "Define how reference genomes are defined and processed",
+ "description": "Define how genomes and variants in REFERENCE panel are defined and processed for PCA",
"default": "",
"properties": {
"run_ancestry": {
"type": "string",
- "format": "file-path",
- "description": "Path to reference database. Must be set if --ref_samplesheet is not set."
+ "description": "Path to reference database. Must be set if --ref_samplesheet is not set.",
+ "format": "file-path"
},
"ref_samplesheet": {
"type": "string",
- "description": "Path to a samplesheet that describes the structure of reference data. Must be set if --ref isn't set."
+ "description": "Path to a samplesheet that describes the structure of reference data. Must be set if --ref isn't set.",
+ "format": "file-path"
},
"hg19_chain": {
"type": "string",
@@ -153,29 +223,30 @@
"hg38_chain": {
"type": "string",
"description": "Path to a UCSC chain file for converting from hg38 to hg19. Needed if lifting over a custom scoring file.",
+ "pattern": ".*chain.gz$",
"format": "file-path",
"mimetype": "application/gzip"
},
"geno_ref": {
"type": "number",
"default": 0.1,
- "description": "Exclude variants with missing call frequencies greater than a threshold (in reference genomes)",
+ "description": "Exclude VARIANTS with a proportion of missing genotype calls greater than a threshold (in reference genomes)",
"minimum": 0,
"maximum": 1
},
"mind_ref": {
"type": "number",
"default": 0.1,
- "description": "Exclude samples with missing call frequencies greater than a threshold (in reference genomes)",
"minimum": 0,
- "maximum": 1
+ "maximum": 1,
+ "description": "Exclude SAMPLES with a proportion of missing genotype calls greater than a threshold (in reference genomes)"
},
"maf_ref": {
"type": "number",
"default": 0.05,
- "description": "Exclude variants with allele frequency lower than a threshold (in reference genomes)",
"minimum": 0,
- "maximum": 1
+ "maximum": 1,
+ "description": "Exclude variants with minor allele frequency (MAF) lower than a threshold (in reference genomes)"
},
"hwe_ref": {
"type": "number",
@@ -191,83 +262,113 @@
},
"ld_grch37": {
"type": "string",
+ "default": "${projectDir}/assets/ancestry/high-LD-regions-hg19-GRCh37.txt",
"description": "Path to a file that contains areas of high linkage disequilibrium in the reference data (build GRCh37).",
"format": "file-path",
"mimetype": "text/plain"
},
"ld_grch38": {
"type": "string",
+ "default": "${projectDir}/assets/ancestry/high-LD-regions-hg38-GRCh38.txt",
"description": "Path to a file that contains areas of high linkage disequilibrium in the reference data (build GRCh38).",
"format": "file-path",
"mimetype": "text/plain"
},
"ref_format_version": {
"type": "string",
- "default": "v0.1"
+ "default": "v0.1",
+ "description": "Version of the default reference database"
},
"ancestry_checksums": {
- "type": "string"
+ "type": "string",
+ "description": "Used to validate files in the reference database when built"
}
- },
- "required": [
- "geno_ref",
- "mind_ref",
- "maf_ref",
- "hwe_ref",
- "indep_pairwise_ref",
- "ld_grch37",
- "ld_grch38"
- ]
+ }
},
- "compatibility_options": {
- "title": "Compatibility options",
+ "developer_options": {
+ "title": "Developer options",
"type": "object",
- "description": "Define parameters that control how scoring files and target genomes are made compatible with each other",
+ "description": "Control subworkflow execution, useful for debugging",
"default": "",
"properties": {
- "target_build": {
- "type": "string",
- "enum": ["GRCh37", "GRCh38"],
- "description": "Genome build of target genomes"
+ "only_bootstrap": {
+ "type": "boolean",
+ "hidden": true
},
- "liftover": {
+ "only_input": {
"type": "boolean",
- "description": "Lift scoring files to match your target genomes. Requires build information in the header of the scoring files."
+ "hidden": true
},
- "min_lift": {
- "type": "number",
- "default": 0.95,
- "description": "Minimum proportion of variants required to successfully remap a scoring file to a different genome build",
- "minimum": 0,
- "maximum": 1
+ "only_compatible": {
+ "type": "boolean",
+ "hidden": true
+ },
+ "only_match": {
+ "type": "boolean",
+ "hidden": true
+ },
+ "only_projection": {
+ "type": "boolean",
+ "hidden": true
+ },
+ "only_score": {
+ "type": "boolean",
+ "hidden": true
+ },
+ "skip_ancestry": {
+ "type": "boolean",
+ "default": true,
+ "hidden": true
}
- },
- "required": ["target_build"]
+ }
},
- "matching_options": {
- "title": "Matching options",
+ "institutional_config_options": {
+ "title": "Institutional config options",
"type": "object",
- "description": "Define how variants are matched across scoring files and target genomes",
- "default": "",
+ "fa_icon": "fas fa-university",
+ "description": "Parameters used to describe centralised config profiles. These should not be edited.",
+ "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. You should not need to change these values when you run a pipeline.",
"properties": {
- "keep_multiallelic": {
- "type": "boolean",
- "description": "Allow matches of scoring file variants to multiallelic variants in the target dataset"
+ "custom_config_version": {
+ "type": "string",
+ "description": "Git commit id for Institutional configs.",
+ "default": "master",
+ "hidden": true,
+ "fa_icon": "fas fa-users-cog"
},
- "keep_ambiguous": {
- "type": "boolean",
- "description": "Keep matches of scoring file variants to strand ambiguous variants (e.g. A/T and C/G SNPs) in the target dataset. This assumes the scoring file and target dataset report variants on the same strand."
+ "custom_config_base": {
+ "type": "string",
+ "description": "Base directory for Institutional configs.",
+ "default": "https://raw.githubusercontent.com/nf-core/configs/master",
+ "hidden": true,
+ "help_text": "If you're running offline, Nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell Nextflow where to find them with this parameter.",
+ "fa_icon": "fas fa-users-cog"
},
- "min_overlap": {
- "type": "number",
- "default": 0.75,
- "description": "Minimum proportion of variants present in both the score file and input target genomic data",
- "fa_icon": "fas fa-cog",
- "minimum": 0,
- "maximum": 1
+ "config_profile_name": {
+ "type": "string",
+ "description": "Institutional config name.",
+ "hidden": true,
+ "fa_icon": "fas fa-users-cog"
+ },
+ "config_profile_description": {
+ "type": "string",
+ "description": "Institutional config description.",
+ "hidden": true,
+ "fa_icon": "fas fa-users-cog"
+ },
+ "config_profile_contact": {
+ "type": "string",
+ "description": "Institutional config contact information.",
+ "hidden": true,
+ "fa_icon": "fas fa-users-cog"
+ },
+ "config_profile_url": {
+ "type": "string",
+ "description": "Institutional config URL link.",
+ "hidden": true,
+ "fa_icon": "fas fa-users-cog"
}
- },
- "fa_icon": "fas fa-user-cog"
+ }
},
"max_job_request_options": {
"title": "Max job request options",
@@ -298,7 +399,7 @@
"description": "Maximum amount of time that can be requested for any single job.",
"default": "240.h",
"fa_icon": "far fa-clock",
- "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$",
+ "pattern": "^(\\d+\\.?\\s*(s|m|h|d|day)\\s*)+$",
"hidden": true,
"help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`"
}
@@ -317,6 +418,12 @@
"fa_icon": "fas fa-question-circle",
"hidden": true
},
+ "version": {
+ "type": "boolean",
+ "description": "Display version and exit.",
+ "fa_icon": "fas fa-question-circle",
+ "hidden": true
+ },
"publish_dir_mode": {
"type": "string",
"default": "copy",
@@ -352,88 +459,44 @@
"description": "Do not use coloured log outputs.",
"fa_icon": "fas fa-palette",
"hidden": true
+ },
+ "hook_url": {
+ "type": "string",
+ "description": "Incoming hook URL for messaging service",
+ "fa_icon": "fas fa-people-group",
+ "help_text": "Incoming hook URL for messaging service. Currently, MS Teams and Slack are supported.",
+ "hidden": true
}
}
}
},
"allOf": [
{
- "$ref": "#/defs/input_output_options"
+ "$ref": "#/definitions/input_output_options"
},
{
- "$ref": "#/defs/ancestry_options"
+ "$ref": "#/definitions/compatibility_options"
},
{
- "$ref": "#/defs/reference_options"
+ "$ref": "#/definitions/matching_options"
},
{
- "$ref": "#/defs/compatibility_options"
+ "$ref": "#/definitions/genetic_ancestry_options"
},
{
- "$ref": "#/defs/matching_options"
+ "$ref": "#/definitions/reference_options"
},
{
- "$ref": "#/defs/max_job_request_options"
+ "$ref": "#/definitions/developer_options"
},
{
- "$ref": "#/defs/generic_options"
- }
- ],
- "properties": {
- "only_bootstrap": {
- "type": "boolean",
- "hidden": true
- },
- "only_input": {
- "type": "boolean",
- "hidden": true
- },
- "only_compatible": {
- "type": "boolean",
- "hidden": true
- },
- "only_match": {
- "type": "boolean",
- "hidden": true
+ "$ref": "#/definitions/institutional_config_options"
},
- "only_projection": {
- "type": "boolean",
- "hidden": true
- },
- "only_score": {
- "type": "boolean",
- "hidden": true
- },
- "skip_ancestry": {
- "type": "boolean",
- "default": true,
- "hidden": true
- },
- "hook_url": {
- "type": "string"
- },
- "version": {
- "type": "boolean"
- },
- "config_profile_name": {
- "type": "string"
- },
- "config_profile_description": {
- "type": "string"
- },
- "custom_config_version": {
- "type": "string",
- "default": "master"
- },
- "custom_config_base": {
- "type": "string",
- "default": "https://raw.githubusercontent.com/nf-core/configs/master"
- },
- "config_profile_contact": {
- "type": "string"
+ {
+ "$ref": "#/definitions/max_job_request_options"
},
- "config_profile_url": {
- "type": "string"
+ {
+ "$ref": "#/definitions/generic_options"
}
- }
+ ]
}
diff --git a/tests/modules/combine/test.yml b/tests/modules/combine/test.yml
index 0dcf4c16..ba6c16dc 100644
--- a/tests/modules/combine/test.yml
+++ b/tests/modules/combine/test.yml
@@ -14,4 +14,4 @@
- "effect_weight"
- path: output/combine/versions.yml
contains:
- - "pgscatalog.core: 0.2.1"
\ No newline at end of file
+ - "pgscatalog.core: 0.2.2"
\ No newline at end of file
diff --git a/tests/modules/download/test.yml b/tests/modules/download/test.yml
index 0d2bde30..54323166 100644
--- a/tests/modules/download/test.yml
+++ b/tests/modules/download/test.yml
@@ -8,7 +8,7 @@
- path: output/download/PGS000001_hmPOS_GRCh37.txt.gz
- path: output/download/versions.yml
contains:
- - "pgscatalog.core: 0.2.1"
+ - "pgscatalog.core: 0.2.2"
- name: pgscatalog test --efo_trait --pgp_id and --pgs_id
command: nextflow run ./tests/modules/download -entry testmultipleaccessions -c ./tests/config/nextflow.config
@@ -24,7 +24,7 @@
- path: output/download/PGS002054_hmPOS_GRCh37.txt.gz
- path: output/download/versions.yml
contains:
- - "pgscatalog.core: 0.2.1"
+ - "pgscatalog.core: 0.2.2"
- name: pgscatalog test bad accession
command: nextflow run ./tests/modules/download -entry testbadaccession -c ./tests/config/nextflow.config
diff --git a/tests/modules/match/test.yml b/tests/modules/match/test.yml
index 5db1804e..d0c41127 100644
--- a/tests/modules/match/test.yml
+++ b/tests/modules/match/test.yml
@@ -7,7 +7,7 @@
files:
- path: output/test/match/versions.yml
contains:
- - "pgscatalog.match: 0.2.2"
+ - "pgscatalog.match: 0.2.3"
# can't test IPC output (not published)
- name: test match combine module
@@ -20,7 +20,7 @@
files:
- path: output/test/match/versions.yml
contains:
- - "pgscatalog.match: 0.2.2"
+ - "pgscatalog.match: 0.2.3"
- path: output/test/match/test_ALL_additive_0.scorefile.gz
contains:
- "PGS001229"
diff --git a/tests/subworkflows/test_apply_score.py b/tests/subworkflows/test_apply_score.py
index 5575cf2f..6453caff 100644
--- a/tests/subworkflows/test_apply_score.py
+++ b/tests/subworkflows/test_apply_score.py
@@ -19,7 +19,7 @@ def test_aggregated_scores(workflow_dir):
assert not df.isnull().any().any(), "Missing values in aggregated scores"
- cols = ["sampleset", "IID", "PGS", "SUM", "DENOM", "AVG"]
+ cols = ["sampleset", "FID", "IID", "PGS", "SUM", "DENOM", "AVG"]
assert cols == list(df.columns), "Missing columns"
assert (
len(
diff --git a/tests/subworkflows/test_apply_score.yml b/tests/subworkflows/test_apply_score.yml
index 637f37cf..9c322606 100644
--- a/tests/subworkflows/test_apply_score.yml
+++ b/tests/subworkflows/test_apply_score.yml
@@ -16,6 +16,9 @@
- "IID"
- "PGS"
- "SUM"
+ - path: output/score/versions.yml
+ contains:
+ - "pgscatalog.calc: 0.2.2"
must_not_contain:
- "percentile_MostSimilarPop"
- "Z_MostSimilarPop"
diff --git a/tests/subworkflows/test_liftover_run.yml b/tests/subworkflows/test_liftover_run.yml
index 1aa3f90a..e02c2010 100644
--- a/tests/subworkflows/test_liftover_run.yml
+++ b/tests/subworkflows/test_liftover_run.yml
@@ -11,7 +11,7 @@
- "5297845"
- path: output/combine/versions.yml
contains:
- - "pgscatalog.core: 0.2.1"
+ - "pgscatalog.core: 0.2.2"
- name: test input check subworkflow with liftover 37to38
command: nextflow run main.nf --only_input --scorefile ./assets/examples/scorefiles/customgrch37.txt --liftover --target_build GRCh38 -c ./tests/config/nextflow.config --hg19_chain https://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/hg19ToHg38.over.chain.gz --hg38_chain https://hgdownload.soe.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz
@@ -26,4 +26,4 @@
- "5237785"
- path: output/combine/versions.yml
contains:
- - "pgscatalog.core: 0.2.1"
+ - "pgscatalog.core: 0.2.2"
diff --git a/workflows/pgsc_calc.nf b/workflows/pgsc_calc.nf
index 8a57e22e..7bded6e6 100644
--- a/workflows/pgsc_calc.nf
+++ b/workflows/pgsc_calc.nf
@@ -170,7 +170,16 @@ workflow PGSCCALC {
// make sure accessions look sensible before querying PGS Catalog
def pgs_id = WorkflowPgscCalc.prepareAccessions(params.pgs_id, "pgs_id")
def pgp_id = WorkflowPgscCalc.prepareAccessions(params.pgp_id, "pgp_id")
- def trait_efo = WorkflowPgscCalc.prepareAccessions(params.trait_efo, "trait_efo")
+
+ // temporarily handle parameter synonym (--trait_efo -> --efo_id)
+ def traits = [params.trait_efo, params.efo_id].findAll { it != null }.join(",")
+
+ if (params.trait_efo) {
+ println "WARNING: --trait_efo is deprecated and will be removed in a future release, please use --efo_id"
+ }
+
+ def trait_efo = WorkflowPgscCalc.prepareAccessions(traits, "trait_efo")
+
def accessions = pgs_id + pgp_id + trait_efo
if (!accessions.every { it.value == "" }) {