diff --git a/.tests/integration/config/config.yaml b/.tests/integration/config/config.yaml index 76d5595..e52a5d7 100644 --- a/.tests/integration/config/config.yaml +++ b/.tests/integration/config/config.yaml @@ -7,7 +7,7 @@ units: "config/units.tsv" output: "config/output_list.json" -default_container: "docker://hydragenetics/common:0.1.9" +default_container: "docker://hydragenetics/common:1.11.1" modules: alignment: "v0.6.0" @@ -30,6 +30,9 @@ reference: fasta: "reference/homo_sapiens.fasta" fai: "reference/homo_sapiens.fasta.fai" genepanels: "reference/genepanels.list" + str_panels_dir: "reference/str_panels" + str_panels: + - "ataxia.list" sites: "reference/homo_sapiens.known_indels.vcf.gz" wgs_intervals: "reference/homo_sapiens.wgs.interval_list" merge_contigs: # contigs to be merged to a single BAM for mark duplicates @@ -218,7 +221,7 @@ mosdepth_bed: extra: "" multiqc: - container: "docker://hydragenetics/multiqc:1.11" + container: "docker://hydragenetics/multiqc:1.21" reports: DNA: config: "config/multiqc_config_DNA.yaml" @@ -355,4 +358,4 @@ verifybamid2: svd_mu: "reference/1000g.phase3.100k.b38.vcf.gz.dat.mu" vt_decompose: - container: "docker://hydragenetics/vt:2015.11.10" \ No newline at end of file + container: "docker://hydragenetics/vt:2015.11.10" diff --git a/.tests/integration/config/multiqc_config_DNA.yaml b/.tests/integration/config/multiqc_config_DNA.yaml index 199af5d..db67396 100644 --- a/.tests/integration/config/multiqc_config_DNA.yaml +++ b/.tests/integration/config/multiqc_config_DNA.yaml @@ -69,18 +69,16 @@ table_columns_visible: sex_het_ratio: False error_sex_check: True predicted_sex_sex_check: True - Picard: - PCT_PF_READS_ALIGNED: False + "Picard: HsMetrics": + FOLD_ENRICHMENT: False + MEDIAN_TARGET_COVERAGE: False + PCT_TARGET_BASES_30X: False + "Picard: InsertSizeMetrics": summed_median: False summed_mean: True + "Picard: Mark Duplicates": PERCENT_DUPLICATION: True - MEDIAN_COVERAGE: False - MEAN_COVERAGE: False - SD_COVERAGE: False - PCT_30X: False - PCT_TARGET_BASES_30X: False - FOLD_ENRICHMENT: False - Samtools: + "Samtools: stats": error_rate: False non-primary_alignments: False reads_mapped: True @@ -91,7 +89,7 @@ table_columns_visible: # Patriks plug in, addera egna columner till general stats multiqc_cgs: - Picard: + "Picard: HsMetrics": FOLD_80_BASE_PENALTY: title: "Fold80" description: "Fold80 penalty from picard hs metrics" @@ -110,21 +108,13 @@ multiqc_cgs: max: 100 scale: "RdYlGn-rev" format: "{:.2%}" - Samtools: + "Samtools: stats": average_quality: title: "Average Quality" description: "Ratio between the sum of base qualities and total length from Samtools stats" min: 0 max: 60 scale: "RdYlGn" - mosdepth: - 20_x_pc: #Cant get it to work - title: "20x percent" - description: "Fraction of genome with at least 20X coverage" - max: 100 - min: 0 - suffix: "%" - scale: "RdYlGn" # Galler alla kolumner oberoende pa module! table_columns_placement: @@ -152,15 +142,18 @@ table_columns_placement: error_sex_check: 701 predicted_sex_sex_check: 702 family_id: 703 - Picard: + "Picard: HsMetrics": PCT_SELECTED_BASES: 801 FOLD_80_BASE_PENALTY: 802 PCT_PF_READS_ALIGNED: 888 + ZERO_CVG_TARGETS_PCT: 888 + "Picard: InsertSizeMetrics": summed_median: 888 - PERCENT_DUPLICATION: 803 summed_mean: 804 + "Picard: Mark Duplicates": + PERCENT_DUPLICATION: 803 + Picard: STANDARD_DEVIATION: 805 - ZERO_CVG_TARGETS_PCT: 888 MEDIAN_COVERAGE: 888 MEAN_COVERAGE: 888 SD_COVERAGE: 888 diff --git a/.tests/integration/config/output_list.json b/.tests/integration/config/output_list.json index d494ba7..f0b864a 100644 --- a/.tests/integration/config/output_list.json +++ b/.tests/integration/config/output_list.json @@ -13,8 +13,9 @@ "results/{sample}/cnv_sv/{sample}.svdb_merged.filtered.vcf.gz": {"name": "_copy_svdb_merged_filtered_vcf", "file": "annotate/vep_svdb/{sample}_N.merged.svdb_query.vep_annotated.filtered.vcf.gz", "types": ["N"]}, "results/{sample}/{sample}.upd_regions.bed": {"name": "_copy_upd_regions_bed", "file": "cnv_sv/upd/{sample}_N.upd_regions.bed", "types": ["N"]}, "results/{sample}/{sample}.contamination.html": {"name": "_copy_haplochack_contamination_report", "file": "mitochondrial/haplocheck/{sample}_N.contamination.html", "types": ["N"]}, - "results/{sample}/{sample}.expansionhunter_stranger.vcf.gz": {"name": "_copy_stranger_vcf", "file": "cnv_sv/stranger/{sample}_N.stranger.vcf.gz", "types": ["N"]}, - "results/{sample}/{sample}.expansionhunter_stranger.bed": {"name": "_copy_stranger_bed", "file": "cnv_sv/stranger/{sample}_N.stranger.bed", "types": ["N"]}, + "results/{sample}/expansionhunter/{sample}.expansionhunter_stranger.vcf.gz": {"name": "_copy_stranger_vcf", "file": "cnv_sv/stranger/{sample}_N.stranger.vcf.gz", "types": ["N"]}, + "results/{sample}/expansionhunter/{sample}.expansionhunter_stranger.bed": {"name": "_copy_stranger_bed", "file": "cnv_sv/stranger/{sample}_N.stranger.bed", "types": ["N"]}, + "results/{sample}/expansionhunter/{sample}.expansionhunter_stranger_{panel}.bed": {"name": "_copy_stranger_bed_panel", "file": "cnv_sv/stranger/{sample}_N_{panel}.stranger.bed", "types": ["N"]}, "results/{sample}/{sample}.coverage_analysis.xlsx": {"name": "_copy_coverage_excel", "file": "qc/create_cov_excel/{sample}_N.coverage.xlsx", "types": ["N"]}, "results/{sample}/SMNCopyNumberCaller/{sample}.smn_charts.pdf": {"name": "_copy_smn_pdf", "file": "cnv_sv/smn_charts/smn_{sample}_N.pdf", "types": ["N"]}, "results/{sample}/SMNCopyNumberCaller/{sample}.smn_caller.tsv": {"name": "_copy_smn_tsv", "file": "cnv_sv/smn_caller/{sample}_N.tsv", "types": ["N"]}, @@ -25,5 +26,6 @@ "results/{sample}/{sample}_N.cram": {"name": "_copy_samtools_cram", "file": "compression/samtools_view/{sample}_N.cram", "types": ["N"]}, "results/{sample}/{sample}_N.cram.crai": {"name": "_copy_samtools_crai", "file": "compression/samtools_view/{sample}_N.cram.crai", "types": ["N"]}, "results/{sample}/spring/{sample}_{type}_{flowcell}_{lane}_{barcode}.spring": {"name": "_copy_spring", "file": "compression/spring/{sample}_{type}_{flowcell}_{lane}_{barcode}.spring", "types": ["N"]}, + "results/peddy.html": {"name": "_copy_peddy_html", "file": "qc/peddy/peddy.html", "types": ["N"]}, "results/multiqc_DNA.html": {"name": "_copy_multiqc_html", "file": "qc/multiqc/multiqc_DNA.html", "types": ["N"]} } diff --git a/.tests/integration/reference/str_panels/ataxia.list b/.tests/integration/reference/str_panels/ataxia.list new file mode 100644 index 0000000..e69de29 diff --git a/.tests/integration/reference/str_panels/test.list b/.tests/integration/reference/str_panels/test.list new file mode 100644 index 0000000..e69de29 diff --git a/.tests/integration/results/versions/software_20240610/Poirot__add_software_versions_mqc_versions.yaml b/.tests/integration/results/versions/software_20240610/Poirot__add_software_versions_mqc_versions.yaml new file mode 100644 index 0000000..e69de29 diff --git a/.tests/integration/results/versions/software_20240610/softwares_mqc_versions.yaml b/.tests/integration/results/versions/software_20240610/softwares_mqc_versions.yaml new file mode 100644 index 0000000..e69de29 diff --git a/config/config.yaml b/config/config.yaml index b9c9da1..7052665 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -32,6 +32,9 @@ reference: genepanels: "/beegfs-storage/projects/wp3/Reference_files/Manifest/Clinical_research_exome/Gene_panels/genepanels_WGS.list" sites: "/beegfs-storage/data/ref_genomes/GRCh38/GRCmasked/Homo_sapiens_assembly38.known_indels.vcf.gz" wgs_intervals: "/beegfs-storage/data/ref_genomes/GRCh38/GRCmasked/homo_sapiens.hg38.wgs.interval_list" + str_panels_dir: "config/str_panels" + str_panels: + - "ataxia.list" merge_contigs: # contigs to be merged to a single BAM for mark duplicates - ".*_random" - "chrUn_.*" @@ -217,7 +220,7 @@ mosdepth_bed: extra: "" multiqc: - container: "docker://hydragenetics/multiqc:1.11" + container: "docker://hydragenetics/multiqc:1.21" reports: DNA: config: "config/multiqc_config_DNA.yaml" diff --git a/config/multiqc_config_DNA.yaml b/config/multiqc_config_DNA.yaml index 041c325..be745a4 100644 --- a/config/multiqc_config_DNA.yaml +++ b/config/multiqc_config_DNA.yaml @@ -3,7 +3,7 @@ subtitle: "Reference used: GRCh38" intro_text: "The MultiQC report summarise analysis results from WGS data that been analysed by the pipeline Poirot_RD-WGS (https://github.com/clinical-genomics-uppsala/Poirot_RD-WGS)." report_header_info: - - Contact E-mail: "jessika.nordin@scilifelab.uu.se" + - Contact E-mail: "igp-klinsek-bioinfo@lists.uu.se" - Application Type: "Bioinformatic analysis of WGS for rare diseases" show_analysis_paths: True @@ -12,32 +12,20 @@ decimalPoint_format: ',' extra_fn_clean_exts: ##from this until end - '.dup' - - '_N' - type: regex pattern: '_fastq[12]' # - '_S' extra_fn_clean_trim: - 'Sample_VE-3297_' -#fn_ignore_dirs: -#fn_ignore_files: -#top_modules: -# - "samtools" -# - "picard" -# - "fastqc" - - -# https://github.com/ewels/MultiQC/blob/master/docs/customisation.md - - -custom_content: # Not working - order: - - fastqc - - fastp - - mosdepth - - peddy - - samtools - - picard +module_order: + - fastqc + - fastp + - verifybamid + - mosdepth + - peddy + - samtools + - picard table_columns_visible: FastQC: @@ -70,19 +58,25 @@ table_columns_visible: sex_het_ratio: False error_sex_check: True predicted_sex_sex_check: True - Picard: - TOTAL_READS: False - PCT_PF_READS_ALIGNED: False + "Picard: HsMetrics": + FOLD_ENRICHMENT: False + MEDIAN_TARGET_COVERAGE: False + PCT_TARGET_BASES_30X: False + "Picard: InsertSizeMetrics": summed_median: False summed_mean: True + "Picard: Mark Duplicates": PERCENT_DUPLICATION: True + "Picard: WGSMetrics": + STANDARD_DEVIATION: False MEDIAN_COVERAGE: False MEAN_COVERAGE: False SD_COVERAGE: False PCT_30X: False PCT_TARGET_BASES_30X: False FOLD_ENRICHMENT: False - Samtools: + + "Samtools: stats": error_rate: False non-primary_alignments: False reads_mapped: True @@ -93,10 +87,10 @@ table_columns_visible: # Patriks plug in, addera egna columner till general stats multiqc_cgs: - Picard: + "Picard: HsMetrics": FOLD_80_BASE_PENALTY: title: "Fold80" - description: "Fold80 penalty from picard WGS metrics" + description: "Fold80 penalty from picard hs metrics" min: 1 max: 3 scale: "RdYlGn-rev" @@ -112,26 +106,13 @@ multiqc_cgs: max: 100 scale: "RdYlGn-rev" format: "{:.2%}" - TOTAL_READS: - title: "Total Reads" - description: "Total read in BAM file from Picard" - scale: "RdYlGn-rev" - format: "{:.1f}" - Samtools: + "Samtools: stats": average_quality: title: "Average Quality" description: "Ratio between the sum of base qualities and total length from Samtools stats" min: 0 max: 60 scale: "RdYlGn" - mosdepth: - 20_x_pc: #Cant get it to work - title: "20x percent" - description: "Fraction of genome with at least 20X coverage" - max: 100 - min: 0 - suffix: "%" - scale: "RdYlGn" # Galler alla kolumner oberoende pa module! table_columns_placement: @@ -143,7 +124,7 @@ table_columns_placement: 20_x_pc: 603 30_x_pc: 604 50_x_pc: 605 - Samtools: + "Samtools: stats": raw_total_sequences: 500 reads_mapped: 501 reads_mapped_percent: 502 @@ -159,16 +140,18 @@ table_columns_placement: error_sex_check: 701 predicted_sex_sex_check: 702 family_id: 703 - Picard: - TOTAL_READS: 800 + "Picard: HsMetrics": PCT_SELECTED_BASES: 801 FOLD_80_BASE_PENALTY: 802 PCT_PF_READS_ALIGNED: 888 + ZERO_CVG_TARGETS_PCT: 888 + "Picard: InsertSizeMetrics": summed_median: 888 - PERCENT_DUPLICATION: 803 summed_mean: 804 + "Picard: Mark Duplicates": + PERCENT_DUPLICATION: 803 + "Picard: WGSMetrics": STANDARD_DEVIATION: 805 - ZERO_CVG_TARGETS_PCT: 888 MEDIAN_COVERAGE: 888 MEAN_COVERAGE: 888 SD_COVERAGE: 888 diff --git a/config/output_list.json b/config/output_list.json index 95d4016..f0b864a 100644 --- a/config/output_list.json +++ b/config/output_list.json @@ -13,8 +13,9 @@ "results/{sample}/cnv_sv/{sample}.svdb_merged.filtered.vcf.gz": {"name": "_copy_svdb_merged_filtered_vcf", "file": "annotate/vep_svdb/{sample}_N.merged.svdb_query.vep_annotated.filtered.vcf.gz", "types": ["N"]}, "results/{sample}/{sample}.upd_regions.bed": {"name": "_copy_upd_regions_bed", "file": "cnv_sv/upd/{sample}_N.upd_regions.bed", "types": ["N"]}, "results/{sample}/{sample}.contamination.html": {"name": "_copy_haplochack_contamination_report", "file": "mitochondrial/haplocheck/{sample}_N.contamination.html", "types": ["N"]}, - "results/{sample}/{sample}.expansionhunter_stranger.vcf.gz": {"name": "_copy_stranger_vcf", "file": "cnv_sv/stranger/{sample}_N.stranger.vcf.gz", "types": ["N"]}, - "results/{sample}/{sample}.expansionhunter_stranger.bed": {"name": "_copy_stranger_bed", "file": "cnv_sv/stranger/{sample}_N.stranger.bed", "types": ["N"]}, + "results/{sample}/expansionhunter/{sample}.expansionhunter_stranger.vcf.gz": {"name": "_copy_stranger_vcf", "file": "cnv_sv/stranger/{sample}_N.stranger.vcf.gz", "types": ["N"]}, + "results/{sample}/expansionhunter/{sample}.expansionhunter_stranger.bed": {"name": "_copy_stranger_bed", "file": "cnv_sv/stranger/{sample}_N.stranger.bed", "types": ["N"]}, + "results/{sample}/expansionhunter/{sample}.expansionhunter_stranger_{panel}.bed": {"name": "_copy_stranger_bed_panel", "file": "cnv_sv/stranger/{sample}_N_{panel}.stranger.bed", "types": ["N"]}, "results/{sample}/{sample}.coverage_analysis.xlsx": {"name": "_copy_coverage_excel", "file": "qc/create_cov_excel/{sample}_N.coverage.xlsx", "types": ["N"]}, "results/{sample}/SMNCopyNumberCaller/{sample}.smn_charts.pdf": {"name": "_copy_smn_pdf", "file": "cnv_sv/smn_charts/smn_{sample}_N.pdf", "types": ["N"]}, "results/{sample}/SMNCopyNumberCaller/{sample}.smn_caller.tsv": {"name": "_copy_smn_tsv", "file": "cnv_sv/smn_caller/{sample}_N.tsv", "types": ["N"]}, diff --git a/config/sample_order.tsv b/config/sample_order.tsv new file mode 100644 index 0000000..133418a --- /dev/null +++ b/config/sample_order.tsv @@ -0,0 +1,25 @@ +Sample Order Sample Name +sample_001 D22-07608 +sample_002 D24-03890 +sample_003 D24-03891 +sample_004 D24-03892 +sample_005 D20-05240 +sample_006 D23-09349 +sample_007 D23-09384 +sample_008 D24-01401 +sample_009 D24-01640 +sample_010 D24-02504 +sample_011 D24-03945 +sample_012 D24-03888 +sample_013 D24-03946 +sample_014 D24-03889 +sample_015 D24-03952 +sample_016 D23-08138 +sample_017 D24-03982 +sample_018 D24-04023 +sample_019 D24-04029 +sample_020 D24-04028 +sample_021 D24-03986 +sample_022 D24-03987 +sample_023 D24-03985 +sample_024 D24-04082 diff --git a/config/sample_replacement.tsv b/config/sample_replacement.tsv new file mode 100644 index 0000000..6f6dc85 --- /dev/null +++ b/config/sample_replacement.tsv @@ -0,0 +1,24 @@ +D22-07608 sample_001 +D24-03890 sample_002 +D24-03891 sample_003 +D24-03892 sample_004 +D20-05240 sample_005 +D23-09349 sample_006 +D23-09384 sample_007 +D24-01401 sample_008 +D24-01640 sample_009 +D24-02504 sample_010 +D24-03945 sample_011 +D24-03888 sample_012 +D24-03946 sample_013 +D24-03889 sample_014 +D24-03952 sample_015 +D23-08138 sample_016 +D24-03982 sample_017 +D24-04023 sample_018 +D24-04029 sample_019 +D24-04028 sample_020 +D24-03986 sample_021 +D24-03987 sample_022 +D24-03985 sample_023 +D24-04082 sample_024 diff --git a/config/str_panels/ataxia.list b/config/str_panels/ataxia.list new file mode 100644 index 0000000..172120a --- /dev/null +++ b/config/str_panels/ataxia.list @@ -0,0 +1,15 @@ +ATN1 +ATXN1 +ATXN10 +ATXN2 +ATXN3 +ATXN7 +ATXN8OS +BEAN1 +CACNA1A +DAB1 +FGF14 +FXN +NOP56 +PPP2R2B +TBP diff --git a/docs/requirements.txt b/docs/requirements.txt index 269fe7e..28fc58f 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,6 +1,6 @@ jinja2<3.1.0 mkdocs==1.4.2 -pymdown-extensions==9.10 +pymdown-extensions==10.0 mkdocs-schema-reader==0.11.1 mkdocs-simple-hooks==0.1.5 mdx_spanner==0.0.5 diff --git a/requirements.txt b/requirements.txt index 7fed2a0..92b2ca8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -hydra-genetics==1.11.1 +hydra-genetics==3.0.0 pandas>=1.3.1 pulp<2.8 snakemake==7.26.0 diff --git a/workflow/Snakefile b/workflow/Snakefile index 4b30213..8a6cd9e 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -307,12 +307,12 @@ use rule samtools_index from misc as misc_samtools_index with: input: bam="cnv_sv/expansionhunter/{sample}_{type}_realigned.sorted.bam", output: - bam="cnv_sv/expansionhunter/{sample}_{type}_realigned.sorted.bam.bai", + bam=temp("cnv_sv/expansionhunter/{sample}_{type}_realigned.sorted.bam.bai"), log: - "{sample}_{type}_realigned.sorted.bam.bai.log", + "cnv_sv/expansionhunter/{sample}_{type}_realigned.sorted.bam.bai.log", benchmark: repeat( - "{sample}_{type}_realigned.sorted.bam.bai.benchmark.tsv", + "cnv_sv/expansionhunter/{sample}_{type}_realigned.sorted.bam.bai.benchmark.tsv", config.get("samtools_index", {}).get("benchmark_repeats", 1), ) diff --git a/workflow/rules/bcftools.smk b/workflow/rules/bcftools.smk index c25e280..4b0c71f 100644 --- a/workflow/rules/bcftools.smk +++ b/workflow/rules/bcftools.smk @@ -13,10 +13,10 @@ rule bcftools_split_vep: columns=config.get("bcftools_split_vep", {}).get("columns", "gnomad_AF:Float"), extra=config.get("bcftools_split_vep", {}).get("extra", ""), log: - "poirot_rd_wgs/bcftools_split_vep/{sample}_{type}.output.log", + "annotate/vep_svdb/{sample}_{type}.merged.svdb_query.vep_annotated.vep_info.vcf.log", benchmark: repeat( - "poirot_rd_wgs/bcftools_split_vep/{sample}_{type}.output.benchmark.tsv", + "annotate/vep_svdb/{sample}_{type}.merged.svdb_query.vep_annotated.vep_info.vcf.benchmark.tsv", config.get("bcftools_split_vep", {}).get("benchmark_repeats", 1), ) threads: config.get("bcftools_split_vep", {}).get("threads", config["default_resources"]["threads"]) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index c9b303d..757f6ce 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -5,6 +5,7 @@ __license__ = "GPL-3" import pandas import yaml +from datetime import datetime from hydra_genetics.utils.misc import get_module_snakefile from hydra_genetics.utils.resources import load_resources @@ -13,6 +14,20 @@ from hydra_genetics.utils.units import * from hydra_genetics.utils.misc import extract_chr from snakemake.utils import min_version from snakemake.utils import validate +from hydra_genetics import min_version as hydra_min_version + +from hydra_genetics.utils.misc import export_config_as_file +from hydra_genetics.utils.software_versions import add_version_files_to_multiqc +from hydra_genetics.utils.software_versions import add_software_version_to_config +from hydra_genetics.utils.software_versions import export_pipeline_version_as_file +from hydra_genetics.utils.software_versions import export_software_version_as_file +from hydra_genetics.utils.software_versions import get_pipeline_version +from hydra_genetics.utils.software_versions import touch_pipeline_version_file_name +from hydra_genetics.utils.software_versions import touch_software_version_file +from hydra_genetics.utils.software_versions import use_container + + +hydra_min_version("3.0.0") min_version("7.8.0") @@ -43,7 +58,6 @@ validate(config, schema="../schemas/resources.schema.yaml") samples = pandas.read_table(config["samples"], dtype=str).set_index("sample", drop=False) validate(samples, schema="../schemas/samples.schema.yaml") - ### Read and validate units file units = ( pandas.read_table(config["units"], dtype=str) @@ -56,9 +70,21 @@ validate(units, schema="../schemas/units.schema.yaml") with open(config["output"]) as output: output_json = json.load(output) -## contigs in hg38 -contigs = extract_chr("%s.fai" % (config.get("reference", {}).get("fasta", "")), filter_out=[]) -skip_contigs = [c for c in contigs if "_" in c or c == "chrEBV"] +## get version information on pipeline, containers and software +date_string = datetime.now().strftime("%Y%m%d") +pipeline_version = get_pipeline_version(workflow, pipeline_name="Poirot") +version_files = touch_pipeline_version_file_name(pipeline_version, date_string=date_string, directory="results/versions/software") +if use_container(workflow): + version_files.append(touch_software_version_file(config, date_string=date_string, directory="results/versions/software")) +add_version_files_to_multiqc(config, version_files) + + +onstart: + export_pipeline_version_as_file(pipeline_version, date_string=date_string, directory="results/versions/software") + if use_container(workflow): + update_config, software_info = add_software_version_to_config(config, workflow, False) + export_software_version_as_file(software_info, date_string=date_string, directory="results/versions/software") + export_config_as_file(update_config, date_string=date_string, directory="results/versions") ### Set wildcard constraints @@ -73,6 +99,10 @@ wildcard_constraints: vcf="vcf|g.vcf|unfiltered.vcf", +## contigs in hg38 +contigs = extract_chr("%s.fai" % (config.get("reference", {}).get("fasta", "")), filter_out=[]) +skip_contigs = [c for c in contigs if "_" in c or c == "chrEBV"] + ### Functions @@ -248,13 +278,16 @@ def compile_output_list(wildcards): else: output_files += set( [ - output.format(sample=sample, flowcell=flowcell, lane=lane, barcode=barcode, type=unit_type) + output.format(sample=sample, flowcell=flowcell, lane=lane, barcode=barcode, type=unit_type, panel=str_panel) for sample in get_samples(samples) for unit_type in get_unit_types(units, sample) if unit_type in set(output_json[output]["types"]) for flowcell in set([u.flowcell for u in units.loc[(sample, unit_type)].dropna().itertuples()]) for barcode in set([u.barcode for u in units.loc[(sample, unit_type)].dropna().itertuples()]) for lane in set([u.lane for u in units.loc[(sample, unit_type)].dropna().itertuples()]) + for str_panel in [ + panel_list.split(".")[0] for panel_list in config.get("reference", {}).get("str_panels", "") + ] ] ) diff --git a/workflow/rules/extract_str_bed.smk b/workflow/rules/extract_str_bed.smk index dc9a847..7cb2bab 100644 --- a/workflow/rules/extract_str_bed.smk +++ b/workflow/rules/extract_str_bed.smk @@ -28,3 +28,20 @@ rule extract_str_bed: "{rule}: Convert stranger annotated {input.vcf} to bed file format" script: "../scripts/extract_str_bed.py" + + +use rule extract_str_bed as extract_str_bed_panel with: + input: + vcf="cnv_sv/stranger/{sample}_{type}.stranger.vcf", + panel_list=expand("{panel_dir}/{{panel}}.list", panel_dir=config.get("reference", {}).get("str_panels_dir", "")), + output: + bed=temp("cnv_sv/stranger/{sample}_{type}_{panel}.stranger.bed"), + log: + "cnv_sv/stranger/{sample}_{type}_{panel}.stranger.bed.log", + benchmark: + repeat( + "cnv_sv/stranger/{sample}_{type}_{panel}.stranger.bed.benchmark.tsv", + config.get("extract_str_bed_panel", {}).get("benchmark_repeats", 1), + ) + message: + "{rule}: Convert stranger annotated {input.vcf} to bed file format for loci in {input.panel_list}" diff --git a/workflow/rules/fix_sv_header.smk b/workflow/rules/fix_sv_header.smk index 3a7bc8d..522fc9b 100644 --- a/workflow/rules/fix_sv_header.smk +++ b/workflow/rules/fix_sv_header.smk @@ -12,10 +12,10 @@ rule fix_sv_header: params: extra=config.get("fix_sv_header", {}).get("extra", ""), log: - "poirot_rd_wgs/fix_sv_header/{sample}_{type}.output.log", + "annotate/vep_svdb/{sample}_{type}.merged.svdb_query.vep_annotated.fixed_header.vcf.log", benchmark: repeat( - "poirot_rd_wgs/fix_sv_header/{sample}_{type}.output.benchmark.tsv", + "annotate/vep_svdb/{sample}_{type}.merged.svdb_query.vep_annotated.fixed_header.vcf.benchmark.tsv", config.get("fix_sv_header", {}).get("benchmark_repeats", 1), ) threads: config.get("fix_sv_header", {}).get("threads", config["default_resources"]["threads"]) diff --git a/workflow/scripts/extract_str_bed.py b/workflow/scripts/extract_str_bed.py index 5600148..b143fa4 100644 --- a/workflow/scripts/extract_str_bed.py +++ b/workflow/scripts/extract_str_bed.py @@ -3,6 +3,9 @@ loci_to_filter = ["HTT", "HTT_CCG", "C9ORF72"] vcf = pysam.VariantFile(snakemake.input.vcf) +if snakemake.input.panel_list is not None: + loci_to_keep = [locus.rstrip() for locus in open( + snakemake.input.panel_list, 'r')] def get_bed_rec(rec): @@ -37,6 +40,9 @@ def get_bed_rec(rec): str_pathologic_min", file=outfile) for rec in vcf: bed_rec = get_bed_rec(rec) - - if rec.info["VARID"] not in loci_to_filter: - print(bed_rec, file=outfile) + if snakemake.input.panel_list is not None: + if rec.info["VARID"] in loci_to_keep: + print(bed_rec, file=outfile) + else: + if rec.info["VARID"] not in loci_to_filter: + print(bed_rec, file=outfile)