From 8ec19866f7ddafee9e249f37940661cd8682be7c Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Tue, 15 Oct 2024 10:40:23 +0200 Subject: [PATCH 1/7] refactor update multiqc version: --- config/config.yaml | 6 +- config/multiqc_dna_config.yaml | 106 ++++++++++++----------- config/multiqc_rna_config.yaml | 100 +++++++++++++++------ workflow/Snakefile | 8 +- workflow/scripts/sample_order_multiqc.py | 2 +- 5 files changed, 140 insertions(+), 82 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 2dde122..02c0c41 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -14,7 +14,7 @@ modules: fusions: "12f8354" parabricks: "v1.1.0" prealignment: "v1.1.0" - qc: "v0.3.0" + qc: "53c3a82" #"v0.3.0" reports: "7c8b8c5" misc: "v0.1.0" sentieon: "b002d39" @@ -164,7 +164,7 @@ mosdepth_bed: design_bed: "" multiqc: - container: "docker://hydragenetics/multiqc:1.11" + container: "docker://hydragenetics/multiqc:1.21" reports: DNA: config: "config/multiqc_dna_config.yaml" @@ -214,6 +214,8 @@ multiqc: - "prealignment/sortmerna/{sample}_{type}.rrna.log" - "qc/multiqc/RNA_number.table.tsv" - "qc/picard_collect_alignment_summary_metrics/{sample}_{type}.alignment_summary_metrics.txt" + - "alignment/star/{sample}_{type}.Log.final.out" + - "alignment/star/{sample}_{type}.ReadsPerGene.out.tab" pbrun_fq2bam: container: "docker://nvcr.io/nvidia/clara/clara-parabricks:4.0.0-1" diff --git a/config/multiqc_dna_config.yaml b/config/multiqc_dna_config.yaml index 534bb35..94130a0 100644 --- a/config/multiqc_dna_config.yaml +++ b/config/multiqc_dna_config.yaml @@ -3,23 +3,15 @@ extra_fn_clean_exts: ##from this until end - '.duplication_metrics' - type: regex pattern: '_fastq[12]' -extra_fn_clean_trim: - - 'Sample_WA-3560_' report_header_info: - Contact E-mail: "igp-klinsek-bioinfo@lists.uu.se" - Application Type: "TrueSeq PCR Free WGS" - Project Type: "Whole Genome" - -custom content: - order: - - fastqc - - mosdepth - - fastp - - peddy - - samtools - - picard +sp: + 
dna_number_table: + fn: "*DNA_number.table.tsv" custom_data: dna_number_table: @@ -32,10 +24,30 @@ custom_data: title: "DNA number" description: "DNA number based on SampleSheet" -sp: - dna_number_table: - fn: "*DNA_number.table.tsv" - +custom_table_header_config: + general_stats_table: + raw_total_sequences: + suffix: "" + title: "Total seqs M" + reads_mapped: + suffix: "" + title: "Reads mapped M" + reads_mapped_percent: + suffix: "" + reads_properly_paired_percent: + suffix: "" + median_coverage: + suffix: "" + 10_x_pc: + suffix: "" + 30_x_pc: + suffix: "" + 50_x_pc: + suffix: "" + PERCENT_DUPLICATION: + suffix: "" + summed_mean: + suffix: "" table_columns_visible: FastQC: @@ -45,7 +57,7 @@ table_columns_visible: percent_fails: False total_sequences: False fastp: - pct_adapter: True + pct_adapter: False pct_surviving: False after_filtering_gc_content: False filtering_result_passed_filter_reads: False @@ -68,26 +80,30 @@ table_columns_visible: sex_het_ratio: False error_sex_check: True predicted_sex_sex_check: True - Picard: - PCT_PF_READS_ALIGNED: False + "Picard: HsMetrics": + FOLD_ENRICHMENT: False + MEDIAN_TARGET_COVERAGE: False + PCT_TARGET_BASES_30X: False + ZERO_CVG_TARGETS_PCT: Falses + "Picard: InsertSizeMetrics": summed_median: False summed_mean: True + "Picard: Mark Duplicates": PERCENT_DUPLICATION: True + "Picard: WgsMetrics": MEDIAN_COVERAGE: False MEAN_COVERAGE: False SD_COVERAGE: False PCT_30X: False - PCT_TARGET_BASES_30X: False - FOLD_ENRICHMENT: False - TOTAL_READS: True - Samtools: + + "Samtools: stats": error_rate: False non-primary_alignments: False reads_mapped: False reads_mapped_percent: True reads_properly_paired_percent: True reads_MQ0_percent: False - raw_total_sequences: True + raw_total_sequences: True #tidigare from picard # mosdepth custom thresholds mosdepth_config: @@ -100,9 +116,9 @@ mosdepth_config: - 30 - 50 -# Patriks plug in, addera egna columner till general stats +# Custom columns to general stats multiqc_cgs: - Picard: + 
"Picard: HsMetrics": FOLD_80_BASE_PENALTY: title: "Fold80" description: "Fold80 penalty from picard hs metrics" @@ -121,25 +137,17 @@ multiqc_cgs: max: 100 scale: "RdYlGn-rev" format: "{:.2%}" - Samtools: + "Samtools: stats": average_quality: title: "Average Quality" description: "Ratio between the sum of base qualities and total length from Samtools stats" min: 0 max: 60 scale: "RdYlGn" - mosdepth: - 20_x_pc: #Cant get it to work - title: "20x percent" - description: "Fraction of genome with at least 20X coverage" - max: 100 - min: 0 - suffix: "%" - scale: "RdYlGn" # Galler alla kolumner oberoende pa module! table_columns_placement: - dna_number_table: + "Custom content: dna_number_table": dna_number: 300 mosdepth: median_coverage: 601 @@ -150,7 +158,7 @@ table_columns_placement: 20_x_pc: 604 30_x_pc: 605 50_x_pc: 606 - Samtools: + "Samtools: stats": raw_total_sequences: 500 reads_mapped: 501 reads_mapped_percent: 502 @@ -166,19 +174,15 @@ table_columns_placement: error_sex_check: 701 predicted_sex_sex_check: 702 family_id: 703 - Picard: - TOTAL_READS: 500 - PCT_SELECTED_BASES: 801 - FOLD_80_BASE_PENALTY: 802 - PCT_PF_READS_ALIGNED: 888 - summed_median: 888 - PERCENT_DUPLICATION: 803 - summed_mean: 804 - STANDARD_DEVIATION: 805 - ZERO_CVG_TARGETS_PCT: 888 - MEDIAN_COVERAGE: 888 - MEAN_COVERAGE: 888 - SD_COVERAGE: 888 - PCT_30X: 888 - PCT_TARGET_BASES_30X: 888 + "Picard: HsMetrics": FOLD_ENRICHMENT: 888 + MEDIAN_TARGET_COVERAGE: 888 + PCT_TARGET_BASES_30X: 888 + FOLD_80_BASE_PENALTY: 801 + PCT_SELECTED_BASES: 800 + ZERO_CVG_TARGETS_PCT: 805 + "Picard: InsertSizeMetrics": + summed_median: 803 + summed_mean: 803 + "Picard: Mark Duplicates": + PERCENT_DUPLICATION: 802 \ No newline at end of file diff --git a/config/multiqc_rna_config.yaml b/config/multiqc_rna_config.yaml index f298f48..407c9fe 100644 --- a/config/multiqc_rna_config.yaml +++ b/config/multiqc_rna_config.yaml @@ -2,22 +2,27 @@ decimalPoint_format: ',' extra_fn_clean_exts: ##from this until end - 
'.duplication_metrics' - type: regex - pattern: '_fastq[12]' -extra_fn_clean_trim: - - 'Sample_WA-3560_' + pattern: '^IHT[0-9]+-WP2_' + - type: regex + pattern: "_fastq[12]" + +table_sample_merge: + "R1": "_R1_001" + "R2": "_R2_001" + "L008": + - type: "regex" + pattern: "S[0-9]{1,2}_L008" + + report_header_info: - Contact E-mail: "igp-klinsek-bioinfo@lists.uu.se" - Application Type: "Whole Transcriptome Sequencing" - Project Type: "WTS" -custom content: - order: - - mosdepth - - rseqc - - sortmerna - - fastp - - fastqc +sp: + rna_number_table: + fn: "*RNA_number.table.tsv" custom_data: rna_number_table: @@ -31,10 +36,32 @@ custom_data: description: "RNA number based on SampleSheet" placement: 300 -sp: - rna_number_table: - fn: "*RNA_number.table.tsv" - +custom_table_header_config: + general_stats_table: + reads_mapped: + suffix: "" + title: "Reads mapped M" + reads_mapped_percent: + suffix: "" + reads_properly_paired_percent: + suffix: "" + median_coverage: + title: "Median Coverage" + suffix: "" + 10_x_pc: + suffix: "" + 30_x_pc: + suffix: "" + 50_x_pc: + suffix: "" + PERCENT_DUPLICATION: + suffix: "" + summed_mean: + suffix: "" + PCT_SELECTED_BASES: + suffix: "" + ZERO_CVG_TARGETS_PCT: + suffix: "" table_columns_visible: FastQC: @@ -42,14 +69,14 @@ table_columns_visible: percent_gc: False avg_sequence_length: False percent_fails: False - total_sequences: False + total_sequences: True fastp: pct_adapter: False pct_surviving: False after_filtering_gc_content: False filtering_result_passed_filter_reads: False - after_filtering_q30_bases: False - after_filtering_q30_rate: False + after_filtering_q30_bases: True + after_filtering_q30_rate: True pct_duplication: False mosdepth: median_coverage: True @@ -60,16 +87,35 @@ table_columns_visible: 20_x_pc: False 30_x_pc: False 50_x_pc: False + STAR: + star-total_reads: True + star-mapped: False + star-mapped_percent: True + star-uniquely_mapped: False + star-uniquely_mapped_percent: False + star-multimapped: False 
SortMeRNA: rRNA_pct: True - Picard: - PCT_PF_READS_ALIGNED: True + # Picard: + # PCT_PF_READS_ALIGNED: True + "Picard: Mark Duplicates": PERCENT_DUPLICATION: True - Samtools: - error_rate: False - non-primary_alignments: False - reads_mapped: True - reads_mapped_percent: True - reads_properly_paired_percent: True - reads_MQ0_percent: False - raw_total_sequences: True + + +# mosdepth custom thresholds +mosdepth_config: + general_stats_coverage: + - 1 + - 5 + - 10 + - 15 + - 20 + - 30 + - 50 + +# multiqc_cgs: +# "Picard: Alignment Summary": +# MEAN_READ_LENGTH: +# title: "Mean Read Length" +# description: "Mean read length from Picard Alignment Summary Metrics" +# format: "{:.1f}" \ No newline at end of file diff --git a/workflow/Snakefile b/workflow/Snakefile index cc56c70..1ab64f2 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -179,7 +179,13 @@ module alignment: config -use rule star from alignment as alignment_star +use rule star from alignment as alignment_star with: + output: + bam=temp("alignment/star/{sample}_{type}.bam"), + sj=temp("alignment/star/{sample}_{type}.SJ.out.tab"), + reads_per_gene=temp("alignment/star/{sample}_{type}.ReadsPerGene.out.tab"), + log_final=temp("alignment/star/{sample}_{type}.Log.final.out"), + use rule samtools_index from alignment as alignment_samtools_index diff --git a/workflow/scripts/sample_order_multiqc.py b/workflow/scripts/sample_order_multiqc.py index 6feb337..8b084d1 100644 --- a/workflow/scripts/sample_order_multiqc.py +++ b/workflow/scripts/sample_order_multiqc.py @@ -10,7 +10,7 @@ sample_order_index = ["sample", "s_index", "lab_id", "type"] for sample, type, fastq_path in snakemake.params.filelist: fastq = fastq_path.split("/")[-1] - lab_id = fastq.split("_")[0] + lab_id = fastq.split("_")[1] s_pattern = re.compile("_S([0-9]+)_") # In case of missing S-index in fastq1-filename set s_index to 99 (last) try: From fe942e1db279442d6db84a6e09d25da42850420a Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: 
Wed, 16 Oct 2024 15:11:21 +0200 Subject: [PATCH 2/7] feat: update multiqc version --- config/multiqc_dna_config.yaml | 62 ++++++++++++++++++++++++++-------- config/multiqc_rna_config.yaml | 58 ++++++++++++++++++------------- 2 files changed, 81 insertions(+), 39 deletions(-) diff --git a/config/multiqc_dna_config.yaml b/config/multiqc_dna_config.yaml index 94130a0..adf50b8 100644 --- a/config/multiqc_dna_config.yaml +++ b/config/multiqc_dna_config.yaml @@ -1,9 +1,39 @@ decimalPoint_format: ',' extra_fn_clean_exts: ##from this until end - '.duplication_metrics' + - type: regex + pattern: '^HG[0-9]+-[A-Za-z0-9-]+_' - type: regex pattern: '_fastq[12]' +table_sample_merge: + "R1": "_R1_001" + "R2": "_R2_001" + "L008": # to remove SX_L00X from ending to enable grouping + - type: "regex" + pattern: "_S[0-9]{1,2}_L008" + "L007": # to remove SX_L00X from ending to enable grouping + - type: "regex" + pattern: "_S[0-9]{1,2}_L007" + "L006": # to remove SX_L00X from ending to enable grouping + - type: "regex" + pattern: "_S[0-9]{1,2}_L006" + "L005": # to remove SX_L00X from ending to enable grouping + - type: "regex" + pattern: "_[0-9]{1,2}_L005" + "L004": # to remove SX_L00X from ending to enable grouping + - type: "regex" + pattern: "_S[0-9]{1,2}_L004" + "L003": # to remove SX_L00X from ending to enable grouping + - type: "regex" + pattern: "_S[0-9]{1,2}_L003" + "L002": # to remove SX_L00X from ending to enable grouping + - type: "regex" + pattern: "_S[0-9]{1,2}_L002" + "L001": # to remove SX_L00X from ending to enable grouping + - type: "regex" + pattern: "_S[0-9]{1,2}_L001" + report_header_info: - Contact E-mail: "igp-klinsek-bioinfo@lists.uu.se" - Application Type: "TrueSeq PCR Free WGS" @@ -24,14 +54,26 @@ custom_data: title: "DNA number" description: "DNA number based on SampleSheet" +# mosdepth custom thresholds +mosdepth_config: + general_stats_coverage: + - 1 + - 5 + - 10 + - 15 + - 20 + - 30 + - 50 + +# Remove suffix in general stats custom_table_header_config: 
general_stats_table: raw_total_sequences: suffix: "" - title: "Total seqs M" + title: "Total seqs [M]" reads_mapped: suffix: "" - title: "Reads mapped M" + title: "Reads mapped [M]" reads_mapped_percent: suffix: "" reads_properly_paired_percent: @@ -46,9 +88,11 @@ custom_table_header_config: suffix: "" PERCENT_DUPLICATION: suffix: "" + title: "Duplication [%]" summed_mean: suffix: "" +# General stats column visibility table_columns_visible: FastQC: percent_duplicates: False @@ -84,7 +128,7 @@ table_columns_visible: FOLD_ENRICHMENT: False MEDIAN_TARGET_COVERAGE: False PCT_TARGET_BASES_30X: False - ZERO_CVG_TARGETS_PCT: Falses + ZERO_CVG_TARGETS_PCT: False "Picard: InsertSizeMetrics": summed_median: False summed_mean: True @@ -95,7 +139,6 @@ table_columns_visible: MEAN_COVERAGE: False SD_COVERAGE: False PCT_30X: False - "Samtools: stats": error_rate: False non-primary_alignments: False @@ -105,17 +148,6 @@ table_columns_visible: reads_MQ0_percent: False raw_total_sequences: True #tidigare from picard -# mosdepth custom thresholds -mosdepth_config: - general_stats_coverage: - - 1 - - 5 - - 10 - - 15 - - 20 - - 30 - - 50 - # Custom columns to general stats multiqc_cgs: "Picard: HsMetrics": diff --git a/config/multiqc_rna_config.yaml b/config/multiqc_rna_config.yaml index 407c9fe..d0374b7 100644 --- a/config/multiqc_rna_config.yaml +++ b/config/multiqc_rna_config.yaml @@ -9,12 +9,10 @@ extra_fn_clean_exts: ##from this until end table_sample_merge: "R1": "_R1_001" "R2": "_R2_001" - "L008": + "L008": # to remove SX_L00X from ending to enable grouping - type: "regex" pattern: "S[0-9]{1,2}_L008" - - report_header_info: - Contact E-mail: "igp-klinsek-bioinfo@lists.uu.se" - Application Type: "Whole Transcriptome Sequencing" @@ -36,13 +34,20 @@ custom_data: description: "RNA number based on SampleSheet" placement: 300 +# mosdepth custom thresholds +mosdepth_config: + general_stats_coverage: + - 1 + - 5 + - 10 + - 15 + - 20 + - 30 + - 50 + +# Remove suffix from General stats 
columns custom_table_header_config: general_stats_table: - reads_mapped: - suffix: "" - title: "Reads mapped M" - reads_mapped_percent: - suffix: "" reads_properly_paired_percent: suffix: "" median_coverage: @@ -62,21 +67,37 @@ custom_table_header_config: suffix: "" ZERO_CVG_TARGETS_PCT: suffix: "" + star-total_reads: + suffix: "" + title: "Total readpairs [M]" + star-mapped_percent: + suffix: "" + title: "Aligned [%]" + star-uniquely_mapped_percent: + suffix: "" + title: "Uniq aligned [%]" + star-multimapped: + suffix: "" + title: "Multimapped [M]" + sortmerna-rRNA_pct: + suffix: "" + title: "rRNA [%]" +# General stats column visibility table_columns_visible: FastQC: percent_duplicates: False percent_gc: False avg_sequence_length: False percent_fails: False - total_sequences: True + total_sequences: False fastp: pct_adapter: False pct_surviving: False after_filtering_gc_content: False filtering_result_passed_filter_reads: False - after_filtering_q30_bases: True - after_filtering_q30_rate: True + after_filtering_q30_bases: False + after_filtering_q30_rate: False pct_duplication: False mosdepth: median_coverage: True @@ -92,8 +113,8 @@ table_columns_visible: star-mapped: False star-mapped_percent: True star-uniquely_mapped: False - star-uniquely_mapped_percent: False - star-multimapped: False + star-uniquely_mapped_percent: True + star-multimapped: True SortMeRNA: rRNA_pct: True # Picard: @@ -102,17 +123,6 @@ table_columns_visible: PERCENT_DUPLICATION: True -# mosdepth custom thresholds -mosdepth_config: - general_stats_coverage: - - 1 - - 5 - - 10 - - 15 - - 20 - - 30 - - 50 - # multiqc_cgs: # "Picard: Alignment Summary": # MEAN_READ_LENGTH: From c65b1673ce413b2a2db6e21c9f86fbea3aca14e4 Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Thu, 17 Oct 2024 11:14:38 +0200 Subject: [PATCH 3/7] chore: add versions to multiqc and Results-folder --- workflow/rules/common.smk | 42 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) 
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index e006eaf..05fb024 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -3,15 +3,33 @@ __copyright__ = "Copyright 2022, Martin Rippin" __email__ = "arielle.munters@scilifelab.uu.se, nina.hollfelder@scilifelab.uu.se" __license__ = "GPL-3" + +import itertools +import numpy as np import pandas as pd -from snakemake.io import Wildcards -from typing import List, Union +import pathlib +import re from snakemake.utils import validate from snakemake.utils import min_version +import yaml +from datetime import datetime from hydra_genetics.utils.resources import load_resources from hydra_genetics.utils.samples import * from hydra_genetics.utils.units import * +from hydra_genetics import min_version as hydra_min_version + +from hydra_genetics.utils.misc import replace_dict_variables +from hydra_genetics.utils.misc import export_config_as_file +from hydra_genetics.utils.software_versions import add_version_files_to_multiqc +from hydra_genetics.utils.software_versions import add_software_version_to_config +from hydra_genetics.utils.software_versions import export_pipeline_version_as_file +from hydra_genetics.utils.software_versions import export_software_version_as_file +from hydra_genetics.utils.software_versions import get_pipeline_version +from hydra_genetics.utils.software_versions import use_container +from hydra_genetics.utils.software_versions import touch_software_version_file +from hydra_genetics.utils.software_versions import touch_pipeline_version_file_name + min_version("7.8.0") @@ -40,6 +58,26 @@ config = load_resources(config, config["resources"]) validate(config, schema="../schemas/resources.schema.yaml") +## get version information on pipeline, containers and software +pipeline_name = "fluffy_hematology_wgs" +pipeline_version = get_pipeline_version(workflow, pipeline_name=pipeline_name) +version_files = touch_pipeline_version_file_name( + pipeline_version, 
date_string=pipeline_name, directory="Results/versions/software" +) +if use_container(workflow): + version_files.append(touch_software_version_file(config, date_string=pipeline_name, directory="Results/versions/software")) +add_version_files_to_multiqc(config, version_files) + + +onstart: + export_pipeline_version_as_file(pipeline_version, date_string=pipeline_name, directory="Results/versions/software") + if use_container(workflow): + update_config, software_info = add_software_version_to_config(config, workflow, False) + export_software_version_as_file(software_info, date_string=pipeline_name, directory="Results/versions/software") + date_string = datetime.now().strftime("%Y%m%d") + export_config_as_file(update_config, date_string=date_string, directory="Results/versions") + + ### Read and validate samples file samples = pd.read_table(config["samples"], dtype=str).set_index("sample", drop=False) validate(samples, schema="../schemas/samples.schema.yaml") From ab1ab54e7a2217f699122f5065d4ad9bbe9884f2 Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Mon, 21 Oct 2024 15:41:08 +0200 Subject: [PATCH 4/7] chore: update output_files to yaml --- .tests/integration/config.yaml | 2 +- .tests/integration/input/HD829-T_S3_R1.fq.gz | 0 .tests/integration/input/HD829-T_S3_R2.fq.gz | 0 .tests/integration/samples.tsv | 3 +- .tests/integration/units.tsv | 1 + config/config.yaml | 2 +- config/output_files.json | 84 ----- config/output_files.yaml | 374 +++++++++++++++++++ workflow/Snakefile | 71 ++-- workflow/rules/common.smk | 114 +++--- 10 files changed, 480 insertions(+), 171 deletions(-) create mode 100644 .tests/integration/input/HD829-T_S3_R1.fq.gz create mode 100644 .tests/integration/input/HD829-T_S3_R2.fq.gz delete mode 100644 config/output_files.json create mode 100644 config/output_files.yaml diff --git a/.tests/integration/config.yaml b/.tests/integration/config.yaml index dac8efe..a1f2479 100644 --- a/.tests/integration/config.yaml +++ 
b/.tests/integration/config.yaml @@ -1,4 +1,4 @@ -output: "../../config/output_files.json" +output: "../../config/output_files.yaml" reference: design_bed: "reference/homo_sapiens.wgs.bed" diff --git a/.tests/integration/input/HD829-T_S3_R1.fq.gz b/.tests/integration/input/HD829-T_S3_R1.fq.gz new file mode 100644 index 0000000..e69de29 diff --git a/.tests/integration/input/HD829-T_S3_R2.fq.gz b/.tests/integration/input/HD829-T_S3_R2.fq.gz new file mode 100644 index 0000000..e69de29 diff --git a/.tests/integration/samples.tsv b/.tests/integration/samples.tsv index f8cae6f..1d29cb7 100644 --- a/.tests/integration/samples.tsv +++ b/.tests/integration/samples.tsv @@ -1,2 +1,3 @@ sample tumor_content sex -NA12878 1.0 O +NA12878 1.0 K +HD829 1.0 O diff --git a/.tests/integration/units.tsv b/.tests/integration/units.tsv index 70eaa15..60b5267 100644 --- a/.tests/integration/units.tsv +++ b/.tests/integration/units.tsv @@ -2,3 +2,4 @@ sample type platform machine flowcell lane barcode fastq1 fastq2 adapter NA12878 T illumina NovaSeq 1FLOWCELL L001 ACGGAACA input/NA12878-T_S2_R1.fq.gz input/NA12878-T_S2_R2.fq.gz AAAA,TTTT NA12878 N illumina NovaSeq 1FLOWCELL L001 ACGGAACA input/NA12878-N_S1_R1.fq.gz input/NA12878-N_S1_R2.fq.gz CCCC,GGGG NA12878 R illumina NovaSeq 1FLOWCELL L001 ACGGAACA input/NA12878-R_S10_R1.fq.gz input/NA12878-R_S10_R2.fq.gz CCCC,GGGG +HD829 T illumina NovaSeq 1FLOWCELL L001 ACGGAACA input/HD829-T_S3_R1.fq.gz input/HD829-T_S3_R2.fq.gz AAAA,TTTT diff --git a/config/config.yaml b/config/config.yaml index 02c0c41..51853f5 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,7 +1,7 @@ resources: "resources.yaml" samples: "samples.tsv" units: "units.tsv" -output: "config/output_files.json" +output: "config/output_files.yaml" aligner: "bwa_gpu" # bwa_gpu or bwa_sentieon diff --git a/config/output_files.json b/config/output_files.json deleted file mode 100644 index e5d9ab9..0000000 --- a/config/output_files.json +++ /dev/null @@ -1,84 +0,0 @@ -{ - 
"Results/MultiQC_TN.html": {"name": "_results_multiqc_dna", "file": "qc/multiqc/multiqc_DNA.html", "types": ["T", "N"]}, - "Results/MultiQC_R.html": {"name": "_results_multiqc_rna", "file": "qc/multiqc/multiqc_RNA.html", "types": ["R"]}, - - "Results/{sample}/DNA_fusions/{sample}_T.dux4_igh_read_count.txt": {"name": "_results_dux4_igh", "file": "reports/dux_read_counts/{sample}_T.dux4_igh.txt", "types": ["T"]}, - "Results/{sample}/DNA_fusions/{sample}_T.dux4_erg_read_count.txt": {"name": "_results_dux4_erg", "file": "reports/dux_read_counts/{sample}_T.dux4_erg.txt", "types": ["T"]}, - - "Results/{sample}/RNA_fusions/{sample}_R.arriba.tsv": {"name": "_results_arriba", "file": "fusions/arriba/{sample}_R.fusions.tsv", "types": ["R"]}, - "Results/{sample}/RNA_fusions/{sample}_R.arriba.plot.pdf": {"name": "_results_arriba_draw_fusion", "file": "fusions/arriba_draw_fusion/{sample}_R.pdf", "types": ["R"]}, - "Results/{sample}/RNA_fusions/{sample}_R.star_fusion.tsv": {"name": "_results_star_fusion", "file": "fusions/star_fusion/{sample}_R/star-fusion.fusion_predictions.tsv", "types": ["R"]}, - "Results/{sample}/RNA_fusions/{sample}_R.fusioncatcher.tsv": {"name": "_results_fusioncatcher", "file": "fusions/fusioncatcher/{sample}_R/final-list_candidate-fusion-genes.txt", "types": ["R"]}, - "Results/{sample}/Cram/{sample}_R.bam": {"name": "_results_star_cram", "file": "alignment/star/{sample}_R.bam", "types": ["R"]}, - "Results/{sample}/Cram/{sample}_R.bam.bai": {"name": "_results_star_crai", "file": "alignment/star/{sample}_R.bam.bai", "types": ["R"]}, - "Results/{sample}/RNA_fusions/{sample}_R.fusioncatcher_dux4-igh_counts.txt": {"name": "_results_fusioncatcher_dux4_igh_counts", "file": "fusions/fusioncatcher/{sample}_R/dux4-igh_counts.txt", "types": ["R"]}, - "Results/{sample}/RNA_fusions/{sample}_R.fusioncatcher_dux4-igh_filtered-results.txt": {"name": "_results_fusioncatcher_dux4_igh_hits", "file": "fusions/fusioncatcher/{sample}_R/dux4-igh_hits.txt", "types": ["R"]}, - - 
"Results/{sample}/Cram/{sample}_{type}.crumble.cram": {"name": "_results_cram", "file": "compression/crumble/{sample}_{type}.crumble.cram", "types": ["T", "N"]}, - "Results/{sample}/Cram/{sample}_{type}.crumble.cram.crai": {"name": "_results_crai", "file": "compression/crumble/{sample}_{type}.crumble.cram.crai", "types": ["T", "N"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.vcf.gz": {"name": "_results_vcf_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.vcf.gz", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.vcf.gz.tbi": {"name": "_results_tbi_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.vcf.gz.tbi", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.all.vcf.gz": {"name": "_results_vcf_all_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.all.vcf.gz", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.all.vcf.gz.tbi": {"name": "_results_tbi_all_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.all.vcf.gz.tbi", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz": {"name": "_results_vcf_aml_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.aml.vcf.gz", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.aml.vcf.gz.tbi", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz": {"name": "_results_vcf_tm_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.tm.vcf.gz", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.tm.vcf.gz.tbi", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.xlsx": {"name": "_results_xlsx_t", "file": "export_to_xlsx/t/{sample}_T.snvs.xlsx", "types": ["T"]}, - - "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz": 
{"name": "_results_vcf_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz.tbi": {"name": "_results_tbi_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.vcf.gz.tbi", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz": {"name": "_results_vcf_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.all.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz.tbi": {"name": "_results_tbi_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.all.vcf.gz.tbi", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz": {"name": "_results_vcf_aml", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.aml.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.aml.vcf.gz.tbi", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz": {"name": "_results_vcf_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.tm.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.tm.vcf.gz.tbi", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.xlsx": {"name": "_results_xlsx_tn", "file": "export_to_xlsx/tn/{sample}.snvs.xlsx", "types": ["TN"]}, - - "Results/{sample}/SNV_indels/{sample}_mutectcaller_T.all.tsv": {"name": "_results_tsv_all_t", "file": "tsv_files/{sample}_mutectcaller_t.all.tsv", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_mutectcaller_T.aml.tsv": {"name": "_results_tsv_aml_t", "file": "tsv_files/{sample}_mutectcaller_t.aml.tsv", "types": ["T"]}, - - "Results/{sample}/SNV_indels/{sample}_mutectcaller_TN.all.tsv": {"name": "_results_tsv_all", "file": 
"tsv_files/{sample}_mutectcaller_tn.all.tsv", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_mutectcaller_TN.aml.tsv": {"name": "_results_tsv_aml", "file": "tsv_files/{sample}_mutectcaller_tn.aml.tsv", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}.pindel.vcf": {"name": "_results_pindel_vcf", "file": "cnv_sv/pindel_vcf/{sample}_T.no_tc.vcf", "types": ["T"]}, - "Results/{sample}/CNV/{sample}_T.pathology.vcf.gz": {"name": "_results_cnvkit_vcf", "file": "cnv_sv/cnvkit_vcf/{sample}_T.pathology.vcf.gz", "types": ["T"]}, - "Results/{sample}/CNV/{sample}_T.pathology.vcf.gz.tbi": {"name": "_results_cnvkit_tbi", "file": "cnv_sv/cnvkit_vcf/{sample}_T.pathology.vcf.gz.tbi", "types": ["T"]}, - "Results/{sample}/CNV/{sample}_T.CNV.xlsx": {"name": "_results_cnvkit_table", "file": "cnv_sv/cnvkit_table/{sample}_T.CNV.xlsx", "types": ["T"]}, - "Results/{sample}/CNV/{sample}_T.png": {"name": "_results_cnvkit_scatter_whole", "file": "cnv_sv/cnvkit_scatter/{sample}_T.png", "types": ["T"]}, - "Results/{sample}/CNV/{sample}_T_chr{chr}.png": {"name": "_results_cnvkit_scatter", "file": "cnv_sv/cnvkit_scatter/{sample}_T_chr{chr}.png", "types": ["T"]}, - "Results/{sample}/CNV/GATK/{sample}_T.pathology.vcf.gz": {"name": "_results_gatk_vcf", "file": "cnv_sv/gatk_vcf/{sample}_T.pathology.vcf.gz", "types": ["T"]}, - "Results/{sample}/CNV/GATK/{sample}_T.pathology.vcf.gz.tbi": {"name": "_results_gatk_vcf_tbi", "file": "cnv_sv/gatk_vcf/{sample}_T.pathology.vcf.gz.tbi", "types": ["T"]}, - "Results/{sample}/CNV/cnv_html_report/{sample}_T.pathology.chr{chr}.cnv_report.html": {"name": "_results_cnv_html_report_chr", "file": "reports/cnv_html_report/{sample}_T.pathology.chr{chr}.cnv_report.html", "types": ["T"]}, - - "Results/{sample}/SV/{sample}_manta_T.ssa.vcf.gz": {"name": "_results_manta_t_vcf", "file": "cnv_sv/manta_run_workflow_t/{sample}.ssa.vcf.gz", "types": ["T"]}, - "Results/{sample}/SV/{sample}_manta_T.ssa.vcf.gz.tbi": {"name": "_results_manta_t_tbi", "file": 
"cnv_sv/manta_run_workflow_t/{sample}.ssa.vcf.gz.tbi", "types": ["T"]}, - "Results/{sample}/SV/{sample}_manta_T.ssa.all.vcf.gz": {"name": "_results_manta_t_vcf_all", "file": "cnv_sv/manta_run_workflow_t/{sample}.ssa.include.all.vcf.gz", "types": ["T"]}, - "Results/{sample}/SV/{sample}_manta_T.ssa.all.vcf.gz.tbi": {"name": "_results_manta_t_tbi_all", "file": "cnv_sv/manta_run_workflow_t/{sample}.ssa.include.all.vcf.gz.tbi", "types": ["T"]}, - "Results/{sample}/SV/{sample}_manta_T.ssa.aml.vcf.gz": {"name": "_results_manta_t_vcf_aml", "file": "cnv_sv/manta_run_workflow_t/{sample}.ssa.include.aml.vcf.gz", "types": ["T"]}, - "Results/{sample}/SV/{sample}_manta_T.ssa.aml.vcf.gz.tbi": {"name": "_results_manta_t_tbi_aml", "file": "cnv_sv/manta_run_workflow_t/{sample}.ssa.include.aml.vcf.gz.tbi", "types": ["T"]}, - "Results/{sample}/SV/{sample}_manta_T.tsv": {"name": "_results_manta_t_tsv", "file": "tsv_files/{sample}_manta_t.tsv", "types": ["T"]}, - "Results/{sample}/SV/{sample}_manta_T.del.tsv": {"name": "_results_manta_t_tsv_del", "file": "tsv_files/{sample}_manta_t.del.tsv", "types": ["T"]}, - "Results/{sample}/SV/{sample}_manta_T.ins.tsv": {"name": "_results_manta_t_tsv_ins", "file": "tsv_files/{sample}_manta_t.ins.tsv", "types": ["T"]}, - "Results/{sample}/SV/{sample}_manta_T.dup.tsv": {"name": "_results_manta_t_tsv_dup", "file": "tsv_files/{sample}_manta_t.dup.tsv", "types": ["T"]}, - "Results/{sample}/SV/{sample}_manta_T.bnd.tsv": {"name": "_results_manta_t_tsv_bnd", "file": "tsv_files/{sample}_manta_t.tsv", "types": ["T"]}, - "Results/{sample}/SV/{sample}_manta_T.bnd.all.tsv": {"name": "_results_manta_t_tsv_bnd_all", "file": "tsv_files/{sample}_manta_t.all.tsv", "types": ["T"]}, - "Results/{sample}/SV/{sample}_manta_T.bnd.aml.tsv": {"name": "_results_manta_t_tsv_bnd_aml", "file": "tsv_files/{sample}_manta_t.aml.tsv", "types": ["T"]}, - - "Results/{sample}/SV/{sample}_manta_TN.ssa.vcf.gz": {"name": "_results_manta_tn_vcf", "file": 
"cnv_sv/manta_run_workflow_tn/{sample}.ssa.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SV/{sample}_manta_TN.ssa.vcf.gz.tbi": {"name": "_results_manta_tn_tbi", "file": "cnv_sv/manta_run_workflow_tn/{sample}.ssa.vcf.gz.tbi", "types": ["TN"]}, - "Results/{sample}/SV/{sample}_manta_TN.ssa.all.vcf.gz": {"name": "_results_manta_tn_vcf_all", "file": "cnv_sv/manta_run_workflow_tn/{sample}.ssa.include.all.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SV/{sample}_manta_TN.ssa.all.vcf.gz.tbi": {"name": "_results_manta_tn_tbi_all", "file": "cnv_sv/manta_run_workflow_tn/{sample}.ssa.include.all.vcf.gz.tbi", "types": ["TN"]}, - "Results/{sample}/SV/{sample}_manta_TN.ssa.aml.vcf.gz": {"name": "_results_manta_tn_vcf_aml", "file": "cnv_sv/manta_run_workflow_tn/{sample}.ssa.include.aml.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SV/{sample}_manta_TN.ssa.aml.vcf.gz.tbi": {"name": "_results_manta_tn_tbi_aml", "file": "cnv_sv/manta_run_workflow_tn/{sample}.ssa.include.aml.vcf.gz.tbi", "types": ["TN"]}, - "Results/{sample}/SV/{sample}_manta_TN.del.tsv": {"name": "_results_manta_tsv_del", "file": "tsv_files/{sample}_manta_tn.del.tsv", "types": ["TN"]}, - "Results/{sample}/SV/{sample}_manta_TN.ins.tsv": {"name": "_results_manta_tsv_ins", "file": "tsv_files/{sample}_manta_tn.ins.tsv", "types": ["TN"]}, - "Results/{sample}/SV/{sample}_manta_TN.dup.tsv": {"name": "_results_manta_tsv_dup", "file": "tsv_files/{sample}_manta_tn.dup.tsv", "types": ["TN"]}, - "Results/{sample}/SV/{sample}_manta_TN.bnd.tsv": {"name": "_results_manta_tsv_bnd", "file": "tsv_files/{sample}_manta_tn.tsv", "types": ["TN"]}, - "Results/{sample}/SV/{sample}_manta_TN.bnd.all.tsv": {"name": "_results_manta_tsv_bnd_all", "file": "tsv_files/{sample}_manta_tn.all.tsv", "types": ["TN"]}, - "Results/{sample}/SV/{sample}_manta_TN.bnd.aml.tsv": {"name": "_results_manta_tsv_bnd_aml", "file": "tsv_files/{sample}_manta_tn.aml.tsv", "types": ["TN"]}, - "Archive/{sample}_{flowcell}_{lane}_{barcode}_{type}.spring": {"name": 
"_archives_spring", "file": "compression/spring/{sample}_{flowcell}_{lane}_{barcode}_{type}.spring", "types": ["T", "N", "R"]}, - - "Results/{sample}/CNV/SVDB/{sample}_T.pathology.svdb_query.vcf.gz": {"name": "_results_svdb_vcf", "file": "cnv_sv/svdb_query/{sample}_T.pathology.svdb_query.vcf.gz", "types": ["T"]}, - "Results/{sample}/CNV/SVDB/{sample}_T.pathology.svdb_query.vcf.gz.tbi": {"name": "_results_svdb_tbi", "file": "cnv_sv/svdb_query/{sample}_T.pathology.svdb_query.vcf.gz.tbi", "types": ["T"]} -} diff --git a/config/output_files.yaml b/config/output_files.yaml new file mode 100644 index 0000000..a154e18 --- /dev/null +++ b/config/output_files.yaml @@ -0,0 +1,374 @@ +directory: ./Results + +files: + - name: Spring compression + input: "compression/spring/{sample}_{flowcell}_{lane}_{barcode}_{type}.spring" + output: Archive/{sample}_{flowcell}_{lane}_{barcode}_{type}.spring + types: ["T", "N", "R"] + + - name: MultiQC DNA + input: "qc/multiqc/multiqc_DNA.html" + output: multiqc_TN.html + types: ["T", "N"] + + - name: Cram T and N + input: compression/crumble/{sample}_{type}.crumble.cram + output: "{sample}/Cram/{sample}_{type}.crumble.cram" + types: ["T", "N"] + + - name: Crai T and N + input: compression/crumble/{sample}_{type}.crumble.cram.crai + output: "{sample}/Cram/{sample}_{type}.crumble.cram.crai" + types: ["T", "N"] + + # Tumor + # SNV indels + - name: Parabricks T vcf + input: "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.vcf.gz" + output: "{sample}/SNV_indels/{sample}_T.vep.vcf.gz" + types: ["T"] + + - name: Parabricks T tbi + input: "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.vcf.gz.tbi" + output: "{sample}/SNV_indels/{sample}_T.vep.vcf.gz.tbi" + types: ["T"] + + - name: Parabricks T vcf ALL subsample + input: "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.all.vcf.gz" + output: "{sample}/SNV_indels/{sample}_T.vep.all.vcf.gz" + types: ["T"] + + - name: Parabricks T tbi ALL subsample + input: 
"parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.all.vcf.gz.tbi" + output: "{sample}/SNV_indels/{sample}_T.vep.all.vcf.gz.tbi" + types: ["T"] + + - name: Parabricks T tsv ALL subsample + input: tsv_files/{sample}_mutectcaller_t.all.tsv + output: "{sample}/SNV_indels/{sample}_mutectcaller_T.all.tsv" + types: ["T"] + + - name: Parabricks T vcf AML subsample + input: "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.aml.vcf.gz" + output: "{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz" + types: ["T"] + + - name: Parabricks T tbi AML subsample + input: "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.aml.vcf.gz.tbi" + output: "{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz.tbi" + types: ["T"] + + - name: Parabricks T tsv AML subsample + input: tsv_files/{sample}_mutectcaller_t.aml.tsv + output: "{sample}/SNV_indels/{sample}_mutectcaller_T.aml.tsv" + types: ["T"] + + - name: Parabricks T vcf TM subsample + input: "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.tm.vcf.gz" + output: "{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz" + types: ["T"] + + - name: Parabricks T tbi TM subsample + input: "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.tm.vcf.gz.tbi" + output: "{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz.tbi" + types: ["T"] + + - name: Parabricks T xlsx + input: "export_to_xlsx/t/{sample}_T.snvs.xlsx" + output: "{sample}/SNV_indels/{sample}_T.xlsx" + types: ["T"] + + - name: Pindel vcf + input: "cnv_sv/pindel_vcf/{sample}_T.no_tc.vcf" + output: "{sample}/SNV_indels/{sample}.pindel.vcf" + types: ["T"] + + # CNV + - name: CNVkit vcf + input: "cnv_sv/cnvkit_vcf/{sample}_T.pathology.vcf.gz" + output: "{sample}/CNV/vcfs/{sample}_T.cnvkit.pathology.vcf.gz" + types: ["T"] + + - name: CNVkit tbi + input: "cnv_sv/cnvkit_vcf/{sample}_T.pathology.vcf.gz.tbi" + output: "{sample}/CNV/vcfs/{sample}_T.cnvkit.pathology.vcf.gz.tbi" + types: ["T"] + + - name: GATK vcf + input: "cnv_sv/gatk_vcf/{sample}_T.pathology.vcf.gz" + output: 
"{sample}/CNV/vcfs/{sample}_T.gatk.pathology.vcf.gz" + types: ["T"] + + - name: GATK tbi + input: "cnv_sv/gatk_vcf/{sample}_T.pathology.vcf.gz.tbi" + output: "{sample}/CNV/vcfs/{sample}_T.gatk.pathology.vcf.gz.tbi" + types: ["T"] + + - name: CNVkit xlsx + input: "cnv_sv/cnvkit_table/{sample}_T.CNV.xlsx" + output: "{sample}/CNV/{sample}_T.CNVkit.xlsx" + types: ["T"] + + - name: CNVkit plot whole + input: "cnv_sv/cnvkit_scatter/{sample}_T.png" + output: "{sample}/CNV/{sample}_T.png" + types: ["T"] + + - name: CNVkit plot per chr + input: "cnv_sv/cnvkit_scatter/{sample}_T_chr{chr}.png" + output: "{sample}/CNV/{sample}_T_chr{chr}.png" + types: ["T"] + + - name: CNV report per chr + input: "reports/cnv_html_report/{sample}_T.pathology.chr{chr}.cnv_report.html" + output: "{sample}/CNV/cnv_html_report/{sample}_T.pathology.chr{chr}.cnv_report.html" + types: ["T"] + + - name: CNV svdb vcf + input: "cnv_sv/svdb_query/{sample}_T.pathology.svdb_query.vcf.gz" + output: "{sample}/CNV/{sample}_T.pathology.cnv-calls.vcf.gz" + types: ["T"] + + - name: CNV svdb tbi + input: "cnv_sv/svdb_query/{sample}_T.pathology.svdb_query.vcf.gz.tbi" + output: "{sample}/CNV/{sample}_T.pathology.cnv-calls.vcf.gz.tbi" + types: ["T"] + + # DNA fusions + - name: DUX4 igh read count + input: "reports/dux_read_counts/{sample}_T.dux4_igh.txt" + output: "{sample}/DNA_fusions/{sample}_T.dux4_igh_read_count.txt" + types: ["T"] + + - name: DUX4 ERG read count + input: "reports/dux_read_counts/{sample}_T.dux4_erg.txt" + output: "{sample}/DNA_fusions/{sample}_T.dux4_erg_read_count.txt" + types: ["T"] + + # SVs + - name: Manta T vcf + input: "cnv_sv/manta_run_workflow_t/{sample}.ssa.vcf.gz" + output: "{sample}/SV/{sample}_manta_T.ssa.vcf.gz" + types: ["T"] + + - name: Manta T tbi + input: "cnv_sv/manta_run_workflow_t/{sample}.ssa.vcf.gz.tbi" + output: "{sample}/SV/{sample}_manta_T.ssa.vcf.gz.tbi" + types: ["T"] + + - name: Manta T vcf ALL subsample + input: 
"cnv_sv/manta_run_workflow_t/{sample}.ssa.include.all.vcf.gz" + output: "{sample}/SV/{sample}_manta_T.ssa.all.vcf.gz" + types: ["T"] + + - name: Manta T tbi ALL subsample + input: "cnv_sv/manta_run_workflow_t/{sample}.ssa.include.all.vcf.gz.tbi" + output: "{sample}/SV/{sample}_manta_T.ssa.all.vcf.gz.tbi" + types: ["T"] + + - name: Manta T vcf AML subsample + input: "cnv_sv/manta_run_workflow_t/{sample}.ssa.include.aml.vcf.gz" + output: "{sample}/SV/{sample}_manta_T.ssa.aml.vcf.gz" + types: ["T"] + + - name: Manta T tbi AML subsample + input: "cnv_sv/manta_run_workflow_t/{sample}.ssa.include.aml.vcf.gz.tbi" + output: "{sample}/SV/{sample}_manta_T.ssa.aml.vcf.gz.tbi" + types: ["T"] + + - name: Manta T tsv del subsample + input: "tsv_files/{sample}_manta_t.del.tsv" + output: "{sample}/SV/{sample}_manta_T.del.tsv" + types: ["T"] + + - name: Manta T tsv ins subsample + input: "tsv_files/{sample}_manta_t.ins.tsv" + output: "{sample}/SV/{sample}_manta_T.ins.tsv" + types: ["T"] + + - name: Manta T tsv dup subsample + input: "tsv_files/{sample}_manta_t.dup.tsv" + output: "{sample}/SV/{sample}_manta_T.dup.tsv" + types: ["T"] + + - name: Manta T tsv bnd subsample + input: "tsv_files/{sample}_manta_t.tsv" + output: "{sample}/SV/{sample}_manta_T.bnd.tsv" + types: ["T"] + + - name: Manta T tsv bnd-all subsample + input: "tsv_files/{sample}_manta_t.all.tsv" + output: "{sample}/SV/{sample}_manta_T.bnd.all.tsv" + types: ["T"] + + - name: Manta T tsv bnd-aml subsample + input: "tsv_files/{sample}_manta_t.aml.tsv" + output: "{sample}/SV/{sample}_manta_T.bnd.aml.tsv" + types: ["T"] + + # Matched samples TN + # SNV indels + - name: Parabricks TN vcf + input: "parabricks/pbrun_mutectcaller_tn/{sample}.vep.vcf.gz" + output: "{sample}/SNV_indels/{sample}_TN.vep.vcf.gz" + types: ["TN"] + + - name: Parabricks TN tbi + input: "parabricks/pbrun_mutectcaller_tn/{sample}.vep.vcf.gz.tbi" + output: "{sample}/SNV_indels/{sample}_TN.vep.vcf.gz.tbi" + types: ["TN"] + + - name: Parabricks TN vcf ALL 
subsample + input: "parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.all.vcf.gz" + output: "{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz" + types: ["TN"] + + - name: Parabricks TN tbi ALL subsample + input: "parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.all.vcf.gz.tbi" + output: "{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz.tbi" + types: ["TN"] + + - name: Parabricks TN tsv ALL subsample + input: "tsv_files/{sample}_mutectcaller_tn.all.tsv" + output: "{sample}/SNV_indels/{sample}_mutectcaller_TN.all.tsv" + types: ["TN"] + + - name: Parabricks TN vcf AML subsample + input: parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.aml.vcf.gz + output: "{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz" + types: ["TN"] + + - name: Parabricks TN tbi AML subsample + input: parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.aml.vcf.gz.tbi + output: "{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz.tbi" + types: ["TN"] + + - name: Parabricks TN tsv AML subsample + input: "tsv_files/{sample}_mutectcaller_tn.aml.tsv" + output: "{sample}/SNV_indels/{sample}_mutectcaller_TN.aml.tsv" + types: ["TN"] + + - name: Parabricks TN vcf TM subsample + input: parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.tm.vcf.gz + output: "{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz" + types: ["TN"] + + - name: Parabricks TN tbi TM subsample + input: parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.tm.vcf.gz.tbi + output: "{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz.tbi" + types: ["TN"] + + - name: Parabricks TN xlsx + input: "export_to_xlsx/tn/{sample}.snvs.xlsx" + output: "{sample}/SNV_indels/{sample}_TN.xlsx" + types: ["TN"] + + # SVs + - name: Manta TN vcf + input: "cnv_sv/manta_run_workflow_tn/{sample}.ssa.vcf.gz" + output: "{sample}/SV/{sample}_manta_TN.ssa.vcf.gz" + types: ["TN"] + + - name: Manta TN tbi + input: "cnv_sv/manta_run_workflow_tn/{sample}.ssa.vcf.gz.tbi" + output: "{sample}/SV/{sample}_manta_TN.ssa.vcf.gz.tbi" + types: ["TN"] + + - name: Manta TN vcf ALL 
subsample + input: "cnv_sv/manta_run_workflow_tn/{sample}.ssa.include.all.vcf.gz" + output: "{sample}/SV/{sample}_manta_TN.ssa.all.vcf.gz" + types: ["TN"] + + - name: Manta TN tbi ALL subsample + input: "cnv_sv/manta_run_workflow_tn/{sample}.ssa.include.all.vcf.gz.tbi" + output: "{sample}/SV/{sample}_manta_TN.ssa.all.vcf.gz.tbi" + types: ["TN"] + + - name: Manta TN vcf AML subsample + input: "cnv_sv/manta_run_workflow_tn/{sample}.ssa.include.aml.vcf.gz" + output: "{sample}/SV/{sample}_manta_TN.ssa.aml.vcf.gz" + types: ["TN"] + + - name: Manta TN tbi AML subsample + input: "cnv_sv/manta_run_workflow_tn/{sample}.ssa.include.aml.vcf.gz.tbi" + output: "{sample}/SV/{sample}_manta_TN.ssa.aml.vcf.gz.tbi" + types: ["TN"] + + - name: Manta TN tsv del subsample + input: "tsv_files/{sample}_manta_tn.del.tsv" + output: "{sample}/SV/{sample}_manta_TN.del.tsv" + types: ["TN"] + + - name: Manta TN tsv ins subsample + input: "tsv_files/{sample}_manta_tn.ins.tsv" + output: "{sample}/SV/{sample}_manta_TN.ins.tsv" + types: ["TN"] + + - name: Manta TN tsv dup subsample + input: "tsv_files/{sample}_manta_tn.dup.tsv" + output: "{sample}/SV/{sample}_manta_TN.dup.tsv" + types: ["TN"] + + - name: Manta TN tsv bnd subsample + input: "tsv_files/{sample}_manta_tn.tsv" + output: "{sample}/SV/{sample}_manta_TN.bnd.tsv" + types: ["TN"] + + - name: Manta TN tsv bnd-all subsample + input: "tsv_files/{sample}_manta_tn.all.tsv" + output: "{sample}/SV/{sample}_manta_TN.bnd.all.tsv" + types: ["TN"] + + - name: Manta TN tsv bnd-aml subsample + input: "tsv_files/{sample}_manta_tn.aml.tsv" + output: "{sample}/SV/{sample}_manta_TN.bnd.aml.tsv" + types: ["TN"] + + + # RNA + - name: MultiQC RNA + input: "qc/multiqc/multiqc_RNA.html" + output: multiqc_R.html + types: ["R"] + + # alignment + - name: RNA bam + input: "alignment/star/{sample}_R.bam" + output: "{sample}/Cram/{sample}_R.bam" + types: ["R"] + + - name: RNA bai + input: "alignment/star/{sample}_R.bam.bai" + output: 
"{sample}/Cram/{sample}_R.bam.bai" + types: ["R"] + + # Fusions + - name: Arriba tsv + input: "fusions/arriba/{sample}_R.fusions.tsv" + output: "{sample}/RNA_fusions/{sample}_R.arriba.tsv" + types: ["R"] + + - name: Arriba pdf + input: fusions/arriba_draw_fusion/{sample}_R.pdf + output: "{sample}/RNA_fusions/{sample}_R.arriba.plot.pdf" + types: ["R"] + + - name: Star-fusion tsv + input: fusions/star_fusion/{sample}_R/star-fusion.fusion_predictions.tsv + output: "{sample}/RNA_fusions/{sample}_R.star_fusion.tsv" + types: ["R"] + + - name: Fusioncather tsv + input: fusions/fusioncatcher/{sample}_R/final-list_candidate-fusion-genes.txt + output: "{sample}/RNA_fusions/{sample}_R.fusioncatcher.tsv" + types: ["R"] + + - name: Fusioncatcher dux4-igh counts + input: fusions/fusioncatcher/{sample}_R/dux4-igh_counts.txt + output: "{sample}/RNA_fusions/{sample}_R.fusioncatcher_dux4-igh_counts.txt" + types: ["R"] + + - name: Fusioncatcher dux4-igh calls + input: fusions/fusioncatcher/{sample}_R/dux4-igh_hits.txt + output: "{sample}/RNA_fusions/{sample}_R.fusioncatcher_dux4-igh_filtered-results.txt" + types: ["R"] diff --git a/workflow/Snakefile b/workflow/Snakefile index 1ab64f2..27511d3 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -16,21 +16,22 @@ include: "rules/manta_to_tsv.smk" include: "rules/peddy_create_ped.smk" +ruleorder: annotation_simple_sv_annotation_tn > misc_bgzip ruleorder: cnv_sv_manta_run_workflow_tn > misc_tabix ruleorder: cnv_sv_manta_run_workflow_tn > misc_bgzip ruleorder: cnv_sv_manta_run_workflow_t > misc_tabix ruleorder: cnv_sv_manta_run_workflow_t > misc_bgzip -ruleorder: _results_manta_tn_tbi > misc_tabix -ruleorder: _results_manta_t_tbi > misc_tabix -ruleorder: _results_cnvkit_tbi > misc_tabix -ruleorder: _results_vcf_tn > misc_bgzip -ruleorder: _results_gatk_vcf > misc_bgzip -ruleorder: _results_gatk_vcf_tbi > misc_tabix -ruleorder: annotation_simple_sv_annotation_tn > misc_bgzip -ruleorder: _results_pindel_vcf > misc_bgzip ruleorder: 
gatk_model_segments > cnv_sv_gatk_model_segments -ruleorder: _results_svdb_vcf > misc_bgzip -ruleorder: _results_svdb_tbi > misc_tabix + +ruleorder: copy_parabricks_tn_vcf > misc_bgzip +ruleorder: copy_manta_tn_tbi > misc_tabix +ruleorder: copy_manta_t_tbi > misc_tabix +ruleorder: copy_cnvkit_tbi > misc_tabix +ruleorder: copy_gatk_vcf > misc_bgzip +ruleorder: copy_gatk_tbi > misc_tabix +ruleorder: copy_pindel_vcf > misc_bgzip +ruleorder: copy_cnv_svdb_vcf > misc_bgzip +ruleorder: copy_cnv_svdb_tbi > misc_tabix aligner = config.get("aligner", None) @@ -43,31 +44,30 @@ elif aligner == "bwa_gpu": include: "rules/mutectcaller_to_tsv.smk" include: "rules/sample_order_multiqc.smk" - ruleorder: _results_manta_tn_tbi_all > misc_tabix - ruleorder: _results_manta_tn_tbi_aml > misc_tabix - ruleorder: _results_manta_t_tbi_all > misc_tabix - ruleorder: _results_manta_t_tbi_aml > misc_tabix - ruleorder: _results_tbi_tn > misc_tabix - ruleorder: _results_tbi_all > misc_tabix - ruleorder: _results_tbi_aml > misc_tabix - ruleorder: _results_crai > misc_samtools_index - ruleorder: _results_vcf_t > misc_bgzip - ruleorder: _results_tbi_t > misc_tabix - ruleorder: _results_tbi_tn > misc_tabix - ruleorder: _results_vcf_aml > misc_bgzip - ruleorder: _results_vcf_aml_t > misc_bgzip - ruleorder: _results_tbi_aml > misc_tabix - ruleorder: _results_tbi_aml_t > misc_tabix - ruleorder: _results_vcf_all > misc_bgzip - ruleorder: _results_tbi_all > misc_tabix - ruleorder: _results_vcf_all_t > misc_bgzip - ruleorder: _results_tbi_all_t > misc_tabix - ruleorder: _results_vcf_tm > misc_bgzip - ruleorder: _results_tbi_tm > misc_tabix - ruleorder: _results_vcf_tm_t > misc_bgzip - ruleorder: _results_tbi_tm_t > misc_tabix ruleorder: parabricks_pbrun_fq2bam > alignment_samtools_index - ruleorder: _results_star_crai > alignment_samtools_index + ruleorder: copy_crai_t_and_n > misc_samtools_index + ruleorder: copy_rna_bai > alignment_samtools_index + + ruleorder: copy_parabricks_tn_tbi > misc_tabix + 
ruleorder: copy_parabricks_tn_vcf_all_subsample > misc_bgzip + ruleorder: copy_parabricks_tn_tbi_all_subsample > misc_tabix + ruleorder: copy_parabricks_tn_vcf_aml_subsample > misc_bgzip + ruleorder: copy_parabricks_tn_tbi_aml_subsample > misc_tabix + ruleorder: copy_parabricks_tn_vcf_tm_subsample > misc_bgzip + ruleorder: copy_parabricks_tn_tbi_tm_subsample > misc_tabix + ruleorder: copy_manta_tn_tbi_all_subsample > misc_tabix + ruleorder: copy_manta_tn_tbi_aml_subsample > misc_tabix + + ruleorder: copy_parabricks_t_vcf > misc_bgzip + ruleorder: copy_parabricks_t_tbi > misc_tabix + ruleorder: copy_parabricks_t_vcf_all_subsample > misc_bgzip + ruleorder: copy_parabricks_t_tbi_all_subsample > misc_tabix + ruleorder: copy_parabricks_t_vcf_aml_subsample > misc_bgzip + ruleorder: copy_parabricks_t_tbi_aml_subsample > misc_tabix + ruleorder: copy_parabricks_t_vcf_tm_subsample > misc_bgzip + ruleorder: copy_parabricks_t_tbi_tm_subsample > misc_tabix + ruleorder: copy_manta_t_tbi_all_subsample > misc_tabix + ruleorder: copy_manta_t_tbi_aml_subsample > misc_tabix elif aligner == "bwa_sentieon": @@ -77,7 +77,7 @@ elif aligner == "bwa_sentieon": rule all: input: - unpack(compile_output_list), + unpack(compile_output_file_list), module annotation: @@ -187,7 +187,6 @@ use rule star from alignment as alignment_star with: log_final=temp("alignment/star/{sample}_{type}.Log.final.out"), - use rule samtools_index from alignment as alignment_samtools_index diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 05fb024..36ddd7d 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -93,7 +93,7 @@ validate(units, schema="../schemas/units.schema.yaml") ### Read output_files for cp rules with open(config["output"]) as output: - output_json = json.load(output) + output_spec = yaml.safe_load(output.read()) ### Set wildcard constraints @@ -228,15 +228,18 @@ def get_json_for_merge_cnv_json(wildcards): return 
["reports/cnv_html_report/{sample}_{type}.{caller}.{tc_method}.json".format(caller=c, **wildcards) for c in callers] -def compile_output_list(wildcards): +def compile_output_file_list(wildcards): + outdir = pathlib.Path(output_spec["directory"]) output_files = [] - types = type_generator(set([unit.type for unit in units.itertuples()])) + output_fullpath = [] + chromosome_numbers = ["X", "Y"] chromosome_numbers.extend(range(1, 23)) - for output in output_json: + for filedef in output_spec["files"]: + # add all output that is not TN output_files += set( [ - output.format( + filedef["output"].format( sample=sample, type=unit_type, chr=chromosome_number, @@ -247,16 +250,18 @@ def compile_output_list(wildcards): for chromosome_number in chromosome_numbers for sample in get_samples(samples) for unit_type in get_unit_types(units, sample) - if unit_type in set(output_json[output]["types"]) + if unit_type in set(filedef["types"]) for flowcell in set([u.flowcell for u in units.loc[(sample, unit_type)].dropna().itertuples()]) for barcode in set([u.barcode for u in units.loc[(sample, unit_type)].dropna().itertuples()]) for lane in set([u.lane for u in units.loc[(sample, unit_type)].dropna().itertuples()]) ] ) - for output in output_json: + + # Iterate all files again and add all TN files for samples that have both T and N in units + for filedef in output_spec["files"]: output_files += set( [ - output.format( + filedef["output"].format( sample=sample, type=unit_type, chr=chromosome_number, @@ -264,45 +269,58 @@ def compile_output_list(wildcards): for chromosome_number in chromosome_numbers for sample in get_samples(samples) for unit_type in type_generator(get_unit_types(units, sample)) - if unit_type in set(output_json[output]["types"]) and unit_type == "TN" + if unit_type in set(filedef["types"]) and unit_type == "TN" ] ) - return list(set(output_files)) - - -def generate_copy_code(workflow, output_json): - code = "" - for result, values in output_json.items(): - if 
values["file"] is not None: - input_file = values["file"] - output_file = result - rule_name = values["name"] - mem_mb = config.get("_copy", {}).get("mem_mb", config["default_resources"]["mem_mb"]) - mem_per_cpu = config.get("_copy", {}).get("mem_mb", config["default_resources"]["mem_mb"]) - partition = config.get("_copy", {}).get("partition", config["default_resources"]["partition"]) - threads = config.get("_copy", {}).get("threads", config["default_resources"]["threads"]) - time = config.get("_copy", {}).get("time", config["default_resources"]["time"]) - copy_container = config.get("_copy", {}).get("container", config["default_container"]) - result_file = os.path.basename(output_file) - code += f'@workflow.rule(name="{rule_name}")\n' - code += f'@workflow.input("{input_file}")\n' - code += f'@workflow.output("{output_file}")\n' - if "{chr}" in output_file: - code += f'@workflow.log("logs/{rule_name}_{result_file}_chr{{chr}}.log")\n' - else: - code += f'@workflow.log("logs/{rule_name}_{result_file}.log")\n' - code += f'@workflow.container("{copy_container}")\n' - code += f'@workflow.resources(time = "{time}", threads = {threads}, mem_mb = {mem_mb}, mem_per_cpu = {mem_per_cpu}, partition = "{partition}")\n' - code += '@workflow.shellcmd("cp {input} {output}")\n\n' - code += "@workflow.run\n" - code += ( - f"def __rule_{rule_name}(input, output, params, wildcards, threads, resources, log, version, rule, " - "conda_env, container_img, singularity_args, use_singularity, env_modules, bench_record, jobid, is_shell, " - "bench_iteration, cleanup_scripts, shadow_dir, edit_notebook, conda_base_path, basedir, runtime_sourcecache_path, " - "__is_snakemake_rule_func=True):\n" - '\tshell ( "(cp --preserve=timestamps {input[0]} {output[0]}) &> {log}" , bench_record=bench_record, bench_iteration=bench_iteration)\n\n' - ) - exec(compile(code, "result_to_copy", "exec"), workflow.globals) - - -generate_copy_code(workflow, output_json) + # Add directory to beginning of each 
outputfile + for op in output_files: + output_fullpath.append(outdir / Path(op)) + + return list(set(output_fullpath)) + + +def generate_copy_rules(output_spec): + output_directory = pathlib.Path(output_spec["directory"]) + rulestrings = [] + + for f in output_spec["files"]: + if f["input"] is None: + continue + + rule_name = "copy_{}".format("_".join(re.sub(r"[\"'-.,]", "", f["name"].strip().lower()).split())) + input_file = pathlib.Path(f["input"]) + output_file = output_directory / pathlib.Path(f["output"]) + + mem_mb = config.get("_copy", {}).get("mem_mb", config["default_resources"]["mem_mb"]) + mem_per_cpu = config.get("_copy", {}).get("mem_per_cpu", config["default_resources"]["mem_per_cpu"]) + partition = config.get("_copy", {}).get("partition", config["default_resources"]["partition"]) + threads = config.get("_copy", {}).get("threads", config["default_resources"]["threads"]) + time = config.get("_copy", {}).get("time", config["default_resources"]["time"]) + copy_container = config.get("_copy", {}).get("container", config["default_container"]) + + rule_code = "\n".join( + [ + f'@workflow.rule(name="{rule_name}")', + f'@workflow.input("{input_file}")', + f'@workflow.output("{output_file}")', + f'@workflow.log("logs/{rule_name}_{output_file.name}.log")', + f'@workflow.container("{copy_container}")', + f'@workflow.resources(time="{time}", threads={threads}, mem_mb="{mem_mb}", ' + f'mem_per_cpu={mem_per_cpu}, partition="{partition}")', + '@workflow.shellcmd("cp --preserve=timestamps -r {input} {output}")', + "@workflow.run\n", + f"def __rule_{rule_name}(input, output, params, wildcards, threads, resources, " + "log, version, rule, conda_env, container_img, singularity_args, use_singularity, " + "env_modules, bench_record, jobid, is_shell, bench_iteration, cleanup_scripts, " + "shadow_dir, edit_notebook, conda_base_path, basedir, runtime_sourcecache_path, " + "__is_snakemake_rule_func=True):", + '\tshell("(cp --preserve=timestamps -r {input[0]} {output[0]}) &> 
{log}", bench_record=bench_record, ' + "bench_iteration=bench_iteration)\n\n", + ] + ) + rulestrings.append(rule_code) + + exec(compile("\n".join(rulestrings), "copy_result_files", "exec"), workflow.globals) + + +generate_copy_rules(output_spec) From a6291041d00d90d53cd84aa730df0fe1034de7db Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Thu, 24 Oct 2024 10:51:58 +0200 Subject: [PATCH 5/7] fix: add ruleorder and update multiqc rna config --- config/multiqc_rna_config.yaml | 12 ++++++------ workflow/Snakefile | 1 + 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/config/multiqc_rna_config.yaml b/config/multiqc_rna_config.yaml index d0374b7..71a8c43 100644 --- a/config/multiqc_rna_config.yaml +++ b/config/multiqc_rna_config.yaml @@ -67,19 +67,19 @@ custom_table_header_config: suffix: "" ZERO_CVG_TARGETS_PCT: suffix: "" - star-total_reads: + total_reads: suffix: "" title: "Total readpairs [M]" - star-mapped_percent: + mapped_percent: suffix: "" title: "Aligned [%]" - star-uniquely_mapped_percent: + uniquely_mapped_percent: suffix: "" title: "Uniq aligned [%]" - star-multimapped: + multimapped: suffix: "" title: "Multimapped [M]" - sortmerna-rRNA_pct: + rRNA_pct: suffix: "" title: "rRNA [%]" @@ -128,4 +128,4 @@ table_columns_visible: # MEAN_READ_LENGTH: # title: "Mean Read Length" # description: "Mean read length from Picard Alignment Summary Metrics" -# format: "{:.1f}" \ No newline at end of file +# format: "{:.1f}" diff --git a/workflow/Snakefile b/workflow/Snakefile index 27511d3..3ce43c6 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -21,6 +21,7 @@ ruleorder: cnv_sv_manta_run_workflow_tn > misc_tabix ruleorder: cnv_sv_manta_run_workflow_tn > misc_bgzip ruleorder: cnv_sv_manta_run_workflow_t > misc_tabix ruleorder: cnv_sv_manta_run_workflow_t > misc_bgzip +ruleorder: fix_af > filtering_filter_vcf ruleorder: gatk_model_segments > cnv_sv_gatk_model_segments ruleorder: copy_parabricks_tn_vcf > misc_bgzip From 
41ed4c547fc2069bbe95aa31bb71faf1373840f2 Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Thu, 24 Oct 2024 11:10:44 +0200 Subject: [PATCH 6/7] fix: update requirements to hydra genetics 3.0.0 --- requirements.txt | 66 ++++++++---------------------------------------- 1 file changed, 10 insertions(+), 56 deletions(-) diff --git a/requirements.txt b/requirements.txt index 55a725a..0df967a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,56 +1,10 @@ -appdirs==1.4.4 -attrs==23.1.0 -certifi==2023.7.22 -charset-normalizer==3.3.2 -click==8.1.7 -colorama==0.4.6 -commonmark==0.9.1 -ConfigArgParse==1.7 -connection-pool==0.0.3 -datrie==0.8.2 -docutils==0.20.1 -dpath==2.1.6 -drmaa==0.7.9 -fastjsonschema==2.18.1 -gitdb==4.0.11 -GitPython==3.1.40 -hydra-genetics==1.3.0 -idna==3.4 -Jinja2==3.0.1 -jsonschema==4.19.2 -jsonschema-specifications==2023.7.1 -jupyter-core==5.5.0 -MarkupSafe==2.1.3 -nbformat==5.9.2 -networkx==3.2.1 -numpy==1.26.1 -pandas==1.5.2 -pip==20.2.4 -plac==1.4.1 -platformdirs==3.11.0 -psutil==5.9.6 -PuLP==2.7.0 -pyaml==23.9.7 -Pygments==2.16.1 -pysam==0.22.0 -python-dateutil==2.8.2 -pytz==2023.3.post1 -PyYAML==6.0.1 -referencing==0.30.2 -requests==2.31.0 -reretry==0.11.8 -rich==10.9.0 -rpds-py==0.12.0 -setuptools==50.3.2 -six==1.16.0 -smart-open==6.4.0 -smmap==5.0.1 -snakemake==7.19.1 -stopit==1.1.2 -tabulate==0.8.10 -throttler==1.2.2 -toposort==1.10 -traitlets==5.13.0 -urllib3==2.0.7 -wrapt==1.15.0 -yte==1.5.1 +pulp<2.8 +hydra-genetics==3.0.0 +pandas>=1.3.1 +snakemake~=7.32 +singularity==3.0.0 +jinja2==3.0.1 +networkx +pyyaml +drmaa==0.7.9 # if using drmaa scheduler +smart_open<7.0.0 From d8c349fa310e84a373a3f401c797913b9f52561f Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Thu, 24 Oct 2024 11:28:49 +0200 Subject: [PATCH 7/7] style: edit pycodestyle errors --- workflow/scripts/peddy_create_ped.py | 5 ++++- workflow/scripts/sample_order_multiqc.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git 
a/workflow/scripts/peddy_create_ped.py b/workflow/scripts/peddy_create_ped.py index d4fd5d7..cbd8581 100644 --- a/workflow/scripts/peddy_create_ped.py +++ b/workflow/scripts/peddy_create_ped.py @@ -19,4 +19,7 @@ else: sex = "0" with open("qc/peddy/" + line[header_line.index("sample")] + ".peddy.fam", "w+") as pedfile: - pedfile.write("\t".join([line[header_line.index("sample")], line[header_line.index("sample")] + "_T", "0", "0", sex, "-9"]) + "\n") + pedfile.write( + "\t".join([line[header_line.index("sample")], line[header_line.index("sample")] + "_T", "0", "0", sex, "-9"]) + + "\n" + ) diff --git a/workflow/scripts/sample_order_multiqc.py b/workflow/scripts/sample_order_multiqc.py index 8b084d1..6dcf840 100644 --- a/workflow/scripts/sample_order_multiqc.py +++ b/workflow/scripts/sample_order_multiqc.py @@ -15,7 +15,7 @@ # In case of missing S-index in fastq1-filename set s_index to 99 (last) try: s_index = int(s_pattern.search(fastq).group(1)) - except: + except AttributeError: s_index = 99 # If same sample sequenced twice use latest runs s_index for old_sample, old_s, old_lab, old_type in sample_order_duplicates: