Commit b5ae92d

albertodescalzo committed Jan 22, 2024
2 parents 470dac0 + fc24c5b
Showing 22 changed files with 1,615 additions and 866 deletions.
14 changes: 11 additions & 3 deletions .gitignore
@@ -4,6 +4,7 @@
**/dev_test
**/evaluation_old
**/testing
**/results
**/data/callset/
**/data/callset-and-graph/
**/data/downloaded/
@@ -14,6 +15,13 @@
**/data/prepared-datasets/
**/ubuntu-docker_testing
**/playbook_denbi
**/playbook_denbi_working
denbi_gg/roles/github-ssh/
denbi_gg/roles/mamba-snakemake/
**/external-eval
**/external-eval_old
**/external-eval_dev
**/external-eval_*
**/reduced-data
**/__pycache__
**/results_old
**/logs
**/preprocessing
**/genotyping
Empty file modified denbi_gg/test_software.sh (mode 100644 → 100755)
4 changes: 2 additions & 2 deletions evaluation_pipeline/README.md
@@ -194,11 +194,11 @@ Regarding the pipeline execution, we split the pipeline into two workflows to

The download data workflow can be run using the following command:

-`` snakemake download_data --use-conda --cores 28 ``
+`` snakemake --profile workflow/profiles download_data ``

The leave-one-out workflow can be run using the following command:

-`` snakemake leave_one_out --use-conda --cores 28 ``
+`` snakemake --profile workflow/profiles leave_one_out ``
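
Both commands pick up options such as ``--use-conda`` and ``--cores`` from the ``config.yaml`` inside the profile directory passed to ``--profile``, so these flags no longer need to be given on the command line.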

## Acknowledgements

40 changes: 26 additions & 14 deletions evaluation_pipeline/config/config.yaml
@@ -8,20 +8,32 @@ callsets:
    bi: "data/downloaded/vcf/HGSVC-GRCh38/Callset_freeze3_64haplotypes.vcf.gz"
    # reference genome in FASTA format
    reference: "data/downloaded/fasta/GRCh38_full_analysis_set_plus_decoy_hla.fa"
-   # variants contained in the callset. Options are: snp|indels|large-deletion|large-insertion
-   variants:
-     - snp
-     - indels
-     - large-deletion
-     - large-insertion
-   # repeat annotations in BED format
-   repeat_regions: "resources/ucsc-simple-repeats.bed"
-   # samples to run a leave-one-out experiment on
-   leave_one_out_samples: # specify the samples to evaluate on
-     - NA24385
+   reference_fai: "data/downloaded/fasta/GRCh38_full_analysis_set_plus_decoy_hla.fa.fai"
+   leave1out: # specify the samples to evaluate on
+     NA24385:
+       regions: # options: all, multi, bi
+         - all
+         - multi
+         - bi
+       filters: # options: all, typable
+         - typable
+       vartype: # variants contained in the callset. Options are: snp|indels|large-deletion|large-insertion
+         - snp
+         - indels
+         - large-deletion
+         - large-insertion
+   external: # specify the samples to (externally) evaluate on
+     NA24385:
+       path: "data/downloaded/vcf/giab/hg38/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz" # benchmark dataset used as truth for external evaluation
+       callable_regions: "data/downloaded/vcf/giab/hg38/HG002_GRCh38_1_22_v4.2.1_benchmark.bed" # path to BED with callable regions
+       vartype: # options: snp-indel, sv
+         - snp-indel
+         - sv
+       regions: # options: all, multi, bi
+         - all
+       filters: # options: all, typable
+         - typable


# read data. Required file that specifies, per sample, the sample name, path to FASTA/FASTQ data and superpopulation:
# FamilyID SampleID FatherID MotherID Sex Population Superpopulation Sample_Illumina
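
As a minimal sketch (not part of the commit) of how these nested settings are consumed, where `config` is the parsed YAML (assuming PyYAML is installed):

```python
# Sketch: flatten the leave1out settings into (callset, sample, vartype) tuples,
# mirroring what the Snakefile does with list comprehensions.
import yaml

with open("config/config.yaml") as f:
    config = yaml.safe_load(f)

for callset, settings in config["callsets"].items():
    for sample, opts in settings["leave1out"].items():
        for vartype in opts["vartype"]:
            print(callset, sample, vartype)  # e.g. HGSVC NA24385 snp
```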
70 changes: 70 additions & 0 deletions evaluation_pipeline/config/config_reduced-data.yml
@@ -0,0 +1,70 @@

### callsets to be used as input to genotyping tools (multiple ones can be listed)
callsets:
  HGSVC:
    # multi-allelic VCF file
    multi: "reduced-data/Pangenome_graph_freeze3_64haplotypes.vcf.gz"
    # bi-allelic VCF file
    bi: "reduced-data/Callset_freeze3_64haplotypes.vcf.gz"
    # reference genome in FASTA format
    reference: "reduced-data/GRCh38_full_analysis_set_plus_decoy_hla.fa"
    reference_fai: "reduced-data/GRCh38_full_analysis_set_plus_decoy_hla.fa.fai"
    leave1out: # specify the samples to evaluate on
      NA24385:
        regions: # options: all, multi, bi
          - all
          - multi
          - bi
        filters: # options: all, typable
          - typable
        vartype: # variants contained in the callset. Options are: snp|indels|large-deletion|large-insertion
          - snp
          - indels
          - large-deletion
          - large-insertion
    external: # specify the samples to (externally) evaluate on
      NA24385:
        path: "reduced-data/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz" # benchmark dataset used as truth for external evaluation
        callable_regions: "data/downloaded/vcf/giab/hg38/HG002_GRCh38_1_22_v4.2.1_benchmark.bed" # path to BED with callable regions
        vartype: # options: snp-indel, sv
          - snp-indel
          - sv
        regions: # options: all, multi, bi
          - all
        filters: # options: all, typable
          - typable


# read data. Required file that specifies, per sample, the sample name, path to FASTA/FASTQ data and superpopulation:
# FamilyID SampleID FatherID MotherID Sex Population Superpopulation Sample_Illumina
reads: "resources/reads_reduced.tsv"


# PanGenie command. Different versions can be run by listing several command lines
pangenie: {}


# PanGenie command to be used for modularised versions of PanGenie
pangenie-modules:
pangenie.v3: "/usr/local/bin/PanGenie_v3.0.0"


# Coverages to downsample reads to for the leave-one-out experiment. Leave empty to skip downsampling.
downsampling: []
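
For example, ``downsampling: [10, 20]`` (hypothetical values) would evaluate at full coverage plus 10x and 20x subsets, since the Snakefile builds its coverage list as ``coverages = ['full'] + config['downsampling']``.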


# Other programs
programs:
rtg: "/home/ubuntu/rtg-tools-3.12.1/rtg"
bwa: "/usr/bin/bwa"
bayestyper: "/usr/local/bin/bayesTyper"
bayestyper_tools: "/usr/local/bin/bayesTyperTools"
graphtyper: "/usr/local/bin/graphtyper"
kmc: "/usr/bin/kmc"
truvari: "/usr/local/bin/truvari"


# Other parameters
utils:
bayestyper_reference_canon: "data/downloaded/bayestyper_utils/bayestyper_GRCh38_bundle_v1.3/GRCh38_canon.fa"
bayestyper_reference_decoy: "data/downloaded/bayestyper_utils/bayestyper_GRCh38_bundle_v1.3/GRCh38_decoy.fa"
Empty file removed evaluation_pipeline/python3
3 changes: 3 additions & 0 deletions evaluation_pipeline/resources/reads_reduced.tsv
@@ -0,0 +1,3 @@
#FamilyID SampleID FatherID MotherID Sex Population Superpopulation SampleIllumina
NA24385 NA24385 0 0 1 UNKNOWN UNKNOWN reduced-data/NA24385_raw.fastq.gz
1463B NA12878 NA12891 NA12892 2 CEU EUR reduced-data/NA12878_raw.fastq.gz
58 changes: 46 additions & 12 deletions evaluation_pipeline/workflow/Snakefile
@@ -1,20 +1,54 @@
configfile: 'config/config.yaml'
include: 'rules/download-data.smk'
include: 'rules/downsample-reads.smk'
include: 'rules/leave-one-out-experiments.smk'
include: 'rules/preprocessing.smk'
include: 'rules/genotyping.smk'
include: 'rules/evaluation.smk'

coverages = ['full'] + config['downsampling']

callsets = [c for c in config["callsets"].keys()]
chromosomes = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X"]
versions_to_run = [v for v in config['pangenie'].keys()] + [v for v in config['pangenie-modules'].keys()] + ['bayestyper', 'graphtyper']


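# Flatten the per-callset sample, vartype, filter and region settings into one list per evaluation mode: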
samples_leave1out = [t for c in config["callsets"].keys() for t in config["callsets"][c]["leave1out"].keys()]
samples_external = [t for c in config["callsets"].keys() for t in config["callsets"][c]["external"].keys()]
vartypes_leave1out = [var for c in config["callsets"].keys() for t in config["callsets"][c]["leave1out"].keys() for var in config["callsets"][c]["leave1out"][t]["vartype"]]
vartypes_external = [var for c in config["callsets"].keys() for t in config["callsets"][c]["external"].keys() for var in config["callsets"][c]["external"][t]["vartype"]]
filters_leave1out = [f for c in config["callsets"].keys() for t in config["callsets"][c]["leave1out"].keys() for f in config["callsets"][c]["leave1out"][t]["filters"]]
filters_external = [f for c in config["callsets"].keys() for t in config["callsets"][c]["external"].keys() for f in config["callsets"][c]["external"][t]["filters"]]
regions_leave1out = [r for c in config["callsets"].keys() for t in config["callsets"][c]["leave1out"].keys() for r in config["callsets"][c]["leave1out"][t]["regions"]]
regions_external = [r for c in config["callsets"].keys() for t in config["callsets"][c]["external"].keys() for r in config["callsets"][c]["external"][t]["regions"]]

pipelines = ['leave1out', 'external'] ## possibilities: leave1out, external

rule all:
    input:
        expand("results/leave-one-out/{callset}/plots/resources/resources_{callset}-{coverage}.pdf", callset = [c for c in config['callsets'].keys()], coverage = coverages),
        expand("results/leave-one-out/{callset}/plots/comparison-versions/{metric}/{metric}_{coverage}_{regions}.pdf", callset = [c for c in config['callsets'].keys()], metric = ['concordance', 'precision-recall-typable', 'untyped', 'concordance-vs-untyped'], coverage = coverages, regions = ['biallelic', 'multiallelic'])

rule leave_one_out:
    input:
        expand("results/leave-one-out/{callset}/plots/resources/resources_{callset}-{coverage}.pdf", callset = [c for c in config['callsets'].keys()], coverage = coverages),
        expand("results/leave-one-out/{callset}/plots/comparison-versions/{metric}/{metric}_{coverage}_{regions}.pdf", callset = [c for c in config['callsets'].keys()], metric = ['concordance', 'precision-recall-typable', 'untyped', 'concordance-vs-untyped'], coverage = coverages, regions = ['biallelic', 'multiallelic'])
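
Note that ``rule all`` and ``rule leave_one_out`` currently request the same set of plots; ``leave_one_out`` is the target name invoked by the profile-based command shown in the README.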


# generate all combinations of desired output files to be produced
def eval_files(wildcards):
    filenames = []
    for p in pipelines:
        if p == 'leave1out':
            regions, samples, filters, vartypes = regions_leave1out, samples_leave1out, filters_leave1out, vartypes_leave1out
        if p == 'external':
            regions, samples, filters, vartypes = regions_external, samples_external, filters_external, vartypes_external
        for c in callsets:
            for cov in coverages:
                for v in versions_to_run:
                    for r in regions:
                        for t in samples:
                            for f in filters:
                                for var in vartypes:
                                    if var in ['snp', 'indels', 'snp-indel']:
                                        filenames.append("evaluation/" + c + "/" + t + "/" + p + "/" + v + "-" + cov + "-" + f + "-" + r + "-" + var + "/summary.txt")
                                    if var in ['large-insertion', 'large-deletion', 'sv']:
                                        filenames.append("evaluation/" + c + "/" + t + "/" + p + "/" + v + "-" + cov + "-" + f + "-" + r + "-" + var + "/summary.json")

    return filenames

rule evaluation:
    input:
        eval_files
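
The nested loops in `eval_files` enumerate the full cross product of evaluation settings; as a sketch (not part of the commit), the same enumeration for the leave1out pipeline could be written with `itertools.product`:

```python
# Sketch: eval_files' nested loops expressed with itertools.product
# (leave1out pipeline only; the external pipeline is analogous).
from itertools import product

def eval_files_sketch(callsets, coverages, versions, regions, samples, filters, vartypes):
    filenames = []
    for c, cov, v, r, t, f, var in product(callsets, coverages, versions,
                                           regions, samples, filters, vartypes):
        # small-variant classes get a text summary, SV classes a JSON summary
        ext = "summary.txt" if var in ("snp", "indels", "snp-indel") else "summary.json"
        filenames.append(f"evaluation/{c}/{t}/leave1out/{v}-{cov}-{f}-{r}-{var}/{ext}")
    return filenames
```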
11 changes: 11 additions & 0 deletions evaluation_pipeline/workflow/profile-default/config.yaml
@@ -0,0 +1,11 @@
cores: 28
latency-wait: 30
keep-going: False
rerun-incomplete: True
restart-times: 0
max-status-checks-per-second: 0.001
use-conda: True
conda-frontend: conda
nolock: False
configfile: config/config.yaml

11 changes: 11 additions & 0 deletions evaluation_pipeline/workflow/profile-reduced-data/config.yaml
@@ -0,0 +1,11 @@
cores: 28
latency-wait: 30
keep-going: False
rerun-incomplete: True
restart-times: 0
max-status-checks-per-second: 0.001
use-conda: True
conda-frontend: conda
nolock: False
configfile: config/config_reduced-data.yml
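
A profile is selected at runtime by pointing ``--profile`` at its directory, e.g. ``snakemake --profile workflow/profile-reduced-data leave_one_out``; Snakemake reads the ``config.yaml`` inside that directory and applies these options as if they had been passed on the command line.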

46 changes: 39 additions & 7 deletions evaluation_pipeline/workflow/rules/download-data.smk
@@ -1,13 +1,14 @@
# stores paths to reads
-reads_leave_one_out = {}
+read_paths = {}

for line in open(config['reads'], 'r'):
    if line.startswith('#'):
        continue
    fields = line.strip().split()
-   sample_id = fields[2]
+   sample_id = fields[1]
    read_path = fields[7]
-   reads_leave_one_out[sample_id] = read_path
+   read_paths[sample_id] = read_path
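
With the 8-column read table (FamilyID, SampleID, FatherID, MotherID, Sex, Population, Superpopulation, Sample_Illumina), ``fields[1]`` is the SampleID and ``fields[7]`` the read path; the old index 2 selected the FatherID column instead.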


rule download_data:
    input:
@@ -25,10 +26,16 @@ rule download_data:
        'data/downloaded/vcf/HGSVC-GRCh38/Callset_freeze3_64haplotypes.vcf.gz',

        # reads
-       expand("{sample_paths}", sample_paths=list(reads_leave_one_out.values())),
+       expand("{sample_paths}", sample_paths=list(read_paths.values())),

        ## Download GRCh38 bundle for BayesTyper
-       "data/downloaded/bayestyper_utils/bayestyper_GRCh38_bundle.tar.gz"
+       "data/downloaded/bayestyper_utils/bayestyper_GRCh38_bundle.tar.gz",
+
+       ## Download HG002/NA24385 benchmark external validation dataset
+       "data/downloaded/vcf/giab/hg38/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz",
+       "data/downloaded/vcf/giab/hg38/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi",
+       "data/downloaded/vcf/giab/hg38/HG002_GRCh38_1_22_v4.2.1_benchmark.bed"




@@ -183,7 +190,7 @@ rule bwa_index_fasta:
    output:
        "data/downloaded/fasta/{filename}.fa" + ".ann"
    log:
-       "data/downloaded/fasta/{filename}-indexing.log"
+       "logs/data/downloaded/fasta/{filename}-indexing.log"
    resources:
        mem_total_mb=5000
    shell:
@@ -194,13 +201,38 @@
####################### Download utils #######################
####################################################################

### The following data is downloaded from Google Drive, since the original link no longer works.
### --> See issue: https://github.com/bioinformatics-centre/BayesTyper/issues/48

rule download_BayesTyper_GRCh38_bundle:
    output:
        compressed_bundle="data/downloaded/bayestyper_utils/bayestyper_GRCh38_bundle.tar.gz",
        uncompressed_bundle=directory("data/downloaded/bayestyper_utils")
    params:
        file_id = "1ioTjLFkfmvOMsXubJS5_rwpfajPv5G1Q"
    shell:
        """
-       wget -O {output.compressed_bundle} http://people.binf.ku.dk/~lassemaretty/bayesTyper/bayestyper_GRCh38_bundle.tar.gz
+       wget --load-cookies /tmp/cookies.txt \
+       "https://drive.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate \
+       'https://drive.google.com/uc?export=download&id={params.file_id}' -O- | \
+       sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\\1\\n/p')&id={params.file_id}" \
+       -O {output.compressed_bundle} && rm -rf /tmp/cookies.txt
        tar -xvf {output.compressed_bundle} -C {output.uncompressed_bundle}
        """


####################################################################
##### Download benchmark external validation dataset #########
####################################################################

# download GIAB benchmark VCF file for HG002_NA24385
rule download_GIAB_benchmark_HG002:
    output:
        vcf="data/downloaded/vcf/giab/hg38/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz",
        tbi="data/downloaded/vcf/giab/hg38/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi",
        bed="data/downloaded/vcf/giab/hg38/HG002_GRCh38_1_22_v4.2.1_benchmark.bed"
    run:
        shell("wget -O {output.vcf} https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/AshkenazimTrio/HG002_NA24385_son/latest/GRCh38/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz")
        shell("wget -O {output.tbi} https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/AshkenazimTrio/HG002_NA24385_son/latest/GRCh38/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi")
        shell("wget -O {output.bed} https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/AshkenazimTrio/HG002_NA24385_son/latest/GRCh38/HG002_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed")
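
As a quick post-download sanity check (a sketch assuming pysam is installed; not part of the commit), the benchmark VCF can be opened to confirm it is intact and indexed:

```python
# Sketch (assumption: pysam installed): confirm the benchmark VCF is readable.
import pysam

vcf = pysam.VariantFile(
    "data/downloaded/vcf/giab/hg38/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz")
rec = next(iter(vcf))
print(rec.chrom, rec.pos, rec.ref, rec.alts)
```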
