Commit b5ae92d

albertodescalzo committed Jan 22, 2024
2 parents 470dac0 + fc24c5b
Showing 22 changed files with 1,615 additions and 866 deletions.
14 changes: 11 additions & 3 deletions .gitignore
@@ -4,6 +4,7 @@
**/dev_test
**/evaluation_old
**/testing
**/results
**/data/callset/
**/data/callset-and-graph/
**/data/downloaded/
@@ -14,6 +15,13 @@
**/data/prepared-datasets/
**/ubuntu-docker_testing
**/playbook_denbi
**/playbook_denbi_working
denbi_gg/roles/github-ssh/
denbi_gg/roles/mamba-snakemake/
**/external-eval
**/external-eval_old
**/external-eval_dev
**/external-eval_*
**/reduced-data
**/__pycache__
**/results_old
**/logs
**/preprocessing
**/genotyping
Empty file modified denbi_gg/test_software.sh (mode 100644 → 100755)
4 changes: 2 additions & 2 deletions evaluation_pipeline/README.md
@@ -194,11 +194,11 @@ Regarding the pipeline execution, we split the pipeline into two workflows to

The download data workflow can be run using the following command:

-`` snakemake download_data --use-conda --cores 28 ``
+`` snakemake --profile workflow/profiles download_data ``

The leave-one-out workflow can be run using the following command:

-`` snakemake leave_one_out --use-conda --cores 28 ``
+`` snakemake --profile workflow/profiles leave_one_out ``
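
Both commands pick up options such as ``--use-conda`` and ``--cores`` from the ``config.yaml`` inside the profile directory passed to ``--profile``, so these flags no longer need to be given on the command line.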

## Acknowledgements

40 changes: 26 additions & 14 deletions evaluation_pipeline/config/config.yaml
@@ -8,20 +8,32 @@ callsets:
    bi: "data/downloaded/vcf/HGSVC-GRCh38/Callset_freeze3_64haplotypes.vcf.gz"
    # reference genome in FASTA format
    reference: "data/downloaded/fasta/GRCh38_full_analysis_set_plus_decoy_hla.fa"
-   # variants contained in the callset. Options are: snp|indels|large-deletion|large-insertion
-   variants:
-     - snp
-     - indels
-     - large-deletion
-     - large-insertion
-   # repeat annotations in BED format
-   repeat_regions: "resources/ucsc-simple-repeats.bed"
-   # samples to run a leave-one-out experiment on
-   leave_one_out_samples: # specify the samples to evaluate on
-     - NA24385
+   reference_fai: "data/downloaded/fasta/GRCh38_full_analysis_set_plus_decoy_hla.fa.fai"
+   leave1out: # specify the samples to evaluate on
+     NA24385:
+       regions: # options: all, multi, bi
+         - all
+         - multi
+         - bi
+       filters: # options: all, typable
+         - typable
+       vartype: # variants contained in the callset. Options are: snp|indels|large-deletion|large-insertion
+         - snp
+         - indels
+         - large-deletion
+         - large-insertion
+   external: # specify the samples to (externally) evaluate on
+     NA24385:
+       path: "data/downloaded/vcf/giab/hg38/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz" # benchmark dataset used as truth for external evaluation
+       callable_regions: "data/downloaded/vcf/giab/hg38/HG002_GRCh38_1_22_v4.2.1_benchmark.bed" # path to BED with callable regions
+       vartype: # options: snp-indel, sv
+         - snp-indel
+         - sv
+       regions: # options: all, multi, bi
+         - all
+       filters: # options: all, typable
+         - typable


# read data. Required file that specifies, per sample, the sample name, path to FASTA/FASTQ data and superpopulation:
# FamilyID SampleID FatherID MotherID Sex Population Superpopulation Sample_Illumina
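
As a minimal sketch (not part of the commit) of how these nested settings are consumed, where `config` is the parsed YAML (assuming PyYAML is installed):

```python
# Sketch: flatten the leave1out settings into (callset, sample, vartype) tuples,
# mirroring what the Snakefile does with list comprehensions.
import yaml

with open("config/config.yaml") as f:
    config = yaml.safe_load(f)

for callset, settings in config["callsets"].items():
    for sample, opts in settings["leave1out"].items():
        for vartype in opts["vartype"]:
            print(callset, sample, vartype)  # e.g. HGSVC NA24385 snp
```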
70 changes: 70 additions & 0 deletions evaluation_pipeline/config/config_reduced-data.yml
@@ -0,0 +1,70 @@

### callsets to be used as input to genotyping tools (multiple ones can be listed)
callsets:
  HGSVC:
    # multi-allelic VCF file
    multi: "reduced-data/Pangenome_graph_freeze3_64haplotypes.vcf.gz"
    # bi-allelic VCF file
    bi: "reduced-data/Callset_freeze3_64haplotypes.vcf.gz"
    # reference genome in FASTA format
    reference: "reduced-data/GRCh38_full_analysis_set_plus_decoy_hla.fa"
    reference_fai: "reduced-data/GRCh38_full_analysis_set_plus_decoy_hla.fa.fai"
    leave1out: # specify the samples to evaluate on
      NA24385:
        regions: # options: all, multi, bi
          - all
          - multi
          - bi
        filters: # options: all, typable
          - typable
        vartype: # variants contained in the callset. Options are: snp|indels|large-deletion|large-insertion
          - snp
          - indels
          - large-deletion
          - large-insertion
    external: # specify the samples to (externally) evaluate on
      NA24385:
        path: "reduced-data/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz" # benchmark dataset used as truth for external evaluation
        callable_regions: "data/downloaded/vcf/giab/hg38/HG002_GRCh38_1_22_v4.2.1_benchmark.bed" # path to BED with callable regions
        vartype: # options: snp-indel, sv
          - snp-indel
          - sv
        regions: # options: all, multi, bi
          - all
        filters: # options: all, typable
          - typable


# read data. Required file that specifies, per sample, the sample name, path to FASTA/FASTQ data and superpopulation:
# FamilyID SampleID FatherID MotherID Sex Population Superpopulation Sample_Illumina
reads: "resources/reads_reduced.tsv"


# PanGenie command. Different versions can be run by listing several command lines
pangenie: {}


# PanGenie command to be used for modularised versions of PanGenie
pangenie-modules:
pangenie.v3: "/usr/local/bin/PanGenie_v3.0.0"


# Coverages to downsample reads to for the leave-one-out experiment. Leave empty to skip downsampling.
downsampling: []
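
For example, ``downsampling: [10, 20]`` (hypothetical values) would evaluate at full coverage plus 10x and 20x subsets, since the Snakefile builds its coverage list as ``coverages = ['full'] + config['downsampling']``.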


# Other programs
programs:
rtg: "/home/ubuntu/rtg-tools-3.12.1/rtg"
bwa: "/usr/bin/bwa"
bayestyper: "/usr/local/bin/bayesTyper"
bayestyper_tools: "/usr/local/bin/bayesTyperTools"
graphtyper: "/usr/local/bin/graphtyper"
kmc: "/usr/bin/kmc"
truvari: "/usr/local/bin/truvari"


# Other parameters
utils:
bayestyper_reference_canon: "data/downloaded/bayestyper_utils/bayestyper_GRCh38_bundle_v1.3/GRCh38_canon.fa"
bayestyper_reference_decoy: "data/downloaded/bayestyper_utils/bayestyper_GRCh38_bundle_v1.3/GRCh38_decoy.fa"
Empty file removed evaluation_pipeline/python3
3 changes: 3 additions & 0 deletions evaluation_pipeline/resources/reads_reduced.tsv
@@ -0,0 +1,3 @@
#FamilyID SampleID FatherID MotherID Sex Population Superpopulation SampleIllumina
NA24385 NA24385 0 0 1 UNKNOWN UNKNOWN reduced-data/NA24385_raw.fastq.gz
1463B NA12878 NA12891 NA12892 2 CEU EUR reduced-data/NA12878_raw.fastq.gz
58 changes: 46 additions & 12 deletions evaluation_pipeline/workflow/Snakefile
@@ -1,20 +1,54 @@
configfile: 'config/config.yaml'
include: 'rules/download-data.smk'
include: 'rules/downsample-reads.smk'
include: 'rules/leave-one-out-experiments.smk'
include: 'rules/preprocessing.smk'
include: 'rules/genotyping.smk'
include: 'rules/evaluation.smk'

coverages = ['full'] + config['downsampling']

callsets = [c for c in config["callsets"].keys()]
chromosomes = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X"]
versions_to_run = [v for v in config['pangenie'].keys()] + [v for v in config['pangenie-modules'].keys()] + ['bayestyper', 'graphtyper']


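# Flatten the per-callset sample, vartype, filter and region settings into one list per evaluation mode: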
samples_leave1out = [t for c in config["callsets"].keys() for t in config["callsets"][c]["leave1out"].keys()]
samples_external = [t for c in config["callsets"].keys() for t in config["callsets"][c]["external"].keys()]
vartypes_leave1out = [var for c in config["callsets"].keys() for t in config["callsets"][c]["leave1out"].keys() for var in config["callsets"][c]["leave1out"][t]["vartype"]]
vartypes_external = [var for c in config["callsets"].keys() for t in config["callsets"][c]["external"].keys() for var in config["callsets"][c]["external"][t]["vartype"]]
filters_leave1out = [f for c in config["callsets"].keys() for t in config["callsets"][c]["leave1out"].keys() for f in config["callsets"][c]["leave1out"][t]["filters"]]
filters_external = [f for c in config["callsets"].keys() for t in config["callsets"][c]["external"].keys() for f in config["callsets"][c]["external"][t]["filters"]]
regions_leave1out = [r for c in config["callsets"].keys() for t in config["callsets"][c]["leave1out"].keys() for r in config["callsets"][c]["leave1out"][t]["regions"]]
regions_external = [r for c in config["callsets"].keys() for t in config["callsets"][c]["external"].keys() for r in config["callsets"][c]["external"][t]["regions"]]

pipelines = ['leave1out', 'external'] ## possibilities: leave1out, external

rule all:
    input:
        expand("results/leave-one-out/{callset}/plots/resources/resources_{callset}-{coverage}.pdf", callset = [c for c in config['callsets'].keys()], coverage = coverages),
        expand("results/leave-one-out/{callset}/plots/comparison-versions/{metric}/{metric}_{coverage}_{regions}.pdf", callset = [c for c in config['callsets'].keys()], metric = ['concordance', 'precision-recall-typable', 'untyped', 'concordance-vs-untyped'], coverage = coverages, regions = ['biallelic', 'multiallelic'])

rule leave_one_out:
    input:
        expand("results/leave-one-out/{callset}/plots/resources/resources_{callset}-{coverage}.pdf", callset = [c for c in config['callsets'].keys()], coverage = coverages),
        expand("results/leave-one-out/{callset}/plots/comparison-versions/{metric}/{metric}_{coverage}_{regions}.pdf", callset = [c for c in config['callsets'].keys()], metric = ['concordance', 'precision-recall-typable', 'untyped', 'concordance-vs-untyped'], coverage = coverages, regions = ['biallelic', 'multiallelic'])
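
Note that ``rule all`` and ``rule leave_one_out`` currently request the same set of plots; ``leave_one_out`` is the target name invoked by the profile-based command shown in the README.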


# generate all combinations of desired output files to be produced
def eval_files(wildcards):
    filenames = []
    for p in pipelines:
        if p == 'leave1out':
            regions, samples, filters, vartypes = regions_leave1out, samples_leave1out, filters_leave1out, vartypes_leave1out
        if p == 'external':
            regions, samples, filters, vartypes = regions_external, samples_external, filters_external, vartypes_external
        for c in callsets:
            for cov in coverages:
                for v in versions_to_run:
                    for r in regions:
                        for t in samples:
                            for f in filters:
                                for var in vartypes:
                                    if var in ['snp', 'indels', 'snp-indel']:
                                        filenames.append("evaluation/" + c + "/" + t + "/" + p + "/" + v + "-" + cov + "-" + f + "-" + r + "-" + var + "/summary.txt")
                                    if var in ['large-insertion', 'large-deletion', 'sv']:
                                        filenames.append("evaluation/" + c + "/" + t + "/" + p + "/" + v + "-" + cov + "-" + f + "-" + r + "-" + var + "/summary.json")

    return filenames

rule evaluation:
    input:
        eval_files
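
The nested loops in `eval_files` enumerate the full cross product of evaluation settings; as a sketch (not part of the commit), the same enumeration for the leave1out pipeline could be written with `itertools.product`:

```python
# Sketch: eval_files' nested loops expressed with itertools.product
# (leave1out pipeline only; the external pipeline is analogous).
from itertools import product

def eval_files_sketch(callsets, coverages, versions, regions, samples, filters, vartypes):
    filenames = []
    for c, cov, v, r, t, f, var in product(callsets, coverages, versions,
                                           regions, samples, filters, vartypes):
        # small-variant classes get a text summary, SV classes a JSON summary
        ext = "summary.txt" if var in ("snp", "indels", "snp-indel") else "summary.json"
        filenames.append(f"evaluation/{c}/{t}/leave1out/{v}-{cov}-{f}-{r}-{var}/{ext}")
    return filenames
```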
11 changes: 11 additions & 0 deletions evaluation_pipeline/workflow/profile-default/config.yaml
@@ -0,0 +1,11 @@
cores: 28
latency-wait: 30
keep-going: False
rerun-incomplete: True
restart-times: 0
max-status-checks-per-second: 0.001
use-conda: True
conda-frontend: conda
nolock: False
configfile: config/config.yaml

11 changes: 11 additions & 0 deletions evaluation_pipeline/workflow/profile-reduced-data/config.yaml
@@ -0,0 +1,11 @@
cores: 28
latency-wait: 30
keep-going: False
rerun-incomplete: True
restart-times: 0
max-status-checks-per-second: 0.001
use-conda: True
conda-frontend: conda
nolock: False
configfile: config/config_reduced-data.yml
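
A profile is selected at runtime by pointing ``--profile`` at its directory, e.g. ``snakemake --profile workflow/profile-reduced-data leave_one_out``; Snakemake reads the ``config.yaml`` inside that directory and applies these options as if they had been passed on the command line.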

46 changes: 39 additions & 7 deletions evaluation_pipeline/workflow/rules/download-data.smk
@@ -1,13 +1,14 @@
# stores paths to reads
-reads_leave_one_out = {}
+read_paths = {}

for line in open(config['reads'], 'r'):
    if line.startswith('#'):
        continue
    fields = line.strip().split()
-   sample_id = fields[2]
+   sample_id = fields[1]
    read_path = fields[7]
-   reads_leave_one_out[sample_id] = read_path
+   read_paths[sample_id] = read_path
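
With the 8-column read table (FamilyID, SampleID, FatherID, MotherID, Sex, Population, Superpopulation, Sample_Illumina), ``fields[1]`` is the SampleID and ``fields[7]`` the read path; the old index 2 selected the FatherID column instead.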


rule download_data:
    input:
@@ -25,10 +26,16 @@ rule download_data:
        'data/downloaded/vcf/HGSVC-GRCh38/Callset_freeze3_64haplotypes.vcf.gz',

        # reads
-       expand("{sample_paths}", sample_paths=list(reads_leave_one_out.values())),
+       expand("{sample_paths}", sample_paths=list(read_paths.values())),

        ## Download GRCh38 bundle for BayesTyper
-       "data/downloaded/bayestyper_utils/bayestyper_GRCh38_bundle.tar.gz"
+       "data/downloaded/bayestyper_utils/bayestyper_GRCh38_bundle.tar.gz",
+
+       ## Download HG002/NA24385 benchmark external validation dataset
+       "data/downloaded/vcf/giab/hg38/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz",
+       "data/downloaded/vcf/giab/hg38/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi",
+       "data/downloaded/vcf/giab/hg38/HG002_GRCh38_1_22_v4.2.1_benchmark.bed"




@@ -183,7 +190,7 @@ rule bwa_index_fasta:
    output:
        "data/downloaded/fasta/{filename}.fa" + ".ann"
    log:
-       "data/downloaded/fasta/{filename}-indexing.log"
+       "logs/data/downloaded/fasta/{filename}-indexing.log"
    resources:
        mem_total_mb=5000
    shell:
@@ -194,13 +201,38 @@
####################### Download utils #######################
####################################################################

### The following data is downloaded from Google Drive, since the original link no longer works.
### --> See issue: https://github.com/bioinformatics-centre/BayesTyper/issues/48

rule download_BayesTyper_GRCh38_bundle:
    output:
        compressed_bundle="data/downloaded/bayestyper_utils/bayestyper_GRCh38_bundle.tar.gz",
        uncompressed_bundle=directory("data/downloaded/bayestyper_utils")
    params:
        file_id = "1ioTjLFkfmvOMsXubJS5_rwpfajPv5G1Q"
    shell:
        """
-       wget -O {output.compressed_bundle} http://people.binf.ku.dk/~lassemaretty/bayesTyper/bayestyper_GRCh38_bundle.tar.gz
+       wget --load-cookies /tmp/cookies.txt \
+       "https://drive.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate \
+       'https://drive.google.com/uc?export=download&id={params.file_id}' -O- | \
+       sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\\1\\n/p')&id={params.file_id}" \
+       -O {output.compressed_bundle} && rm -rf /tmp/cookies.txt
        tar -xvf {output.compressed_bundle} -C {output.uncompressed_bundle}
        """


####################################################################
##### Download benchmark external validation dataset #########
####################################################################

# download GIAB benchmark VCF file for HG002_NA24385
rule download_GIAB_benchmark_HG002:
    output:
        vcf="data/downloaded/vcf/giab/hg38/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz",
        tbi="data/downloaded/vcf/giab/hg38/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi",
        bed="data/downloaded/vcf/giab/hg38/HG002_GRCh38_1_22_v4.2.1_benchmark.bed"
    run:
        shell("wget -O {output.vcf} https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/AshkenazimTrio/HG002_NA24385_son/latest/GRCh38/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz")
        shell("wget -O {output.tbi} https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/AshkenazimTrio/HG002_NA24385_son/latest/GRCh38/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi")
        shell("wget -O {output.bed} https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/AshkenazimTrio/HG002_NA24385_son/latest/GRCh38/HG002_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed")
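
As a quick post-download sanity check (a sketch assuming pysam is installed; not part of the commit), the benchmark VCF can be opened to confirm it is intact and indexed:

```python
# Sketch (assumption: pysam installed): confirm the benchmark VCF is readable.
import pysam

vcf = pysam.VariantFile(
    "data/downloaded/vcf/giab/hg38/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz")
rec = next(iter(vcf))
print(rec.chrom, rec.pos, rec.ref, rec.alts)
```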
