From a4fd5647a0f197266a3a32a430e1e1a202bf4663 Mon Sep 17 00:00:00 2001 From: jonca79 <54137490+jonca79@users.noreply.github.com> Date: Fri, 18 Oct 2024 11:11:39 +0200 Subject: [PATCH 01/14] fix: jumble output is dependent on design bed name --- config/output_reference_files.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/output_reference_files.yaml b/config/output_reference_files.yaml index 3afc91e3..9315171a 100644 --- a/config/output_reference_files.yaml +++ b/config/output_reference_files.yaml @@ -5,7 +5,7 @@ files: types: - N - name: jumble_pon - input: references/jumble_reference/design.bed.reference.RDS + input: references/jumble_reference/pool1_pool2.sort.merged.padded20.cnv200.hg19.split_fusion_genes.reannotated.230222.bed.reference.RDS output: result/jumble.PoN.RDS types: - N From 64db79b22dd09eedb38a091b38b93e7b78d6fea6 Mon Sep 17 00:00:00 2001 From: jonca79 <54137490+jonca79@users.noreply.github.com> Date: Fri, 18 Oct 2024 12:43:40 +0200 Subject: [PATCH 02/14] feat: make jumble reference name based on config --- workflow/rules/common_references.smk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflow/rules/common_references.smk b/workflow/rules/common_references.smk index 82747999..82e7219f 100644 --- a/workflow/rules/common_references.smk +++ b/workflow/rules/common_references.smk @@ -54,11 +54,12 @@ def compile_output_list(wildcards): for filedef in output_spec["files"]: output_files += set( [ - filedef["output"].format(sample=sample, type=unit_type, caller=caller) + filedef["output"].format(sample=sample, type=unit_type, caller=caller, design=design) for sample in get_samples(samples) for unit_type in get_unit_types(units, sample) if unit_type in set(filedef["types"]).intersection(types) for caller in config["bcbio_variation_recall_ensemble"]["callers"] + for design in config["reference"]["design_bed"] ] ) return list(set(output_files)) From 39e9d8f1964f0153be1372614e78b57a933c57af Mon Sep 17 00:00:00 2001 From: jonca79 <54137490+jonca79@users.noreply.github.com> Date: Fri, 18 Oct 2024 12:44:54 +0200 Subject: [PATCH 03/14] fix: rm hardcoded jumble reference name --- config/output_reference_files.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/output_reference_files.yaml b/config/output_reference_files.yaml index 9315171a..988a610e 100644 --- a/config/output_reference_files.yaml +++ b/config/output_reference_files.yaml @@ -5,7 +5,7 @@ files: types: - N - name: jumble_pon - input: references/jumble_reference/pool1_pool2.sort.merged.padded20.cnv200.hg19.split_fusion_genes.reannotated.230222.bed.reference.RDS + input: references/jumble_reference/{design}.reference.RDS output: result/jumble.PoN.RDS types: - N From 67c4be7805a34b21120cfbf4661d245071624daa Mon Sep 17 00:00:00 2001 From: jonca79 <54137490+jonca79@users.noreply.github.com> Date: Fri, 18 Oct 2024 12:50:27 +0200 Subject: [PATCH 04/14] fix: only use file name and not path --- workflow/rules/common_references.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/common_references.smk b/workflow/rules/common_references.smk index 82e7219f..509ec16e 100644 --- a/workflow/rules/common_references.smk +++ b/workflow/rules/common_references.smk @@ -59,7 +59,7 @@ def compile_output_list(wildcards): for unit_type in get_unit_types(units, sample) if unit_type in set(filedef["types"]).intersection(types) for caller in config["bcbio_variation_recall_ensemble"]["callers"] - for design in config["reference"]["design_bed"] + for design in config["reference"]["design_bed"].split("/")[-1] ] ) return list(set(output_files)) From 35e4514931c62486f11f86287bb5e0afea06b8ec Mon Sep 17 00:00:00 2001 From: jonca79 <54137490+jonca79@users.noreply.github.com> Date: Fri, 18 Oct 2024 12:51:20 +0200 Subject: [PATCH 05/14] fix: add design wildcard to output --- config/output_reference_files.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/output_reference_files.yaml b/config/output_reference_files.yaml index 988a610e..6c1b5d96 100644 --- a/config/output_reference_files.yaml +++ b/config/output_reference_files.yaml @@ -6,7 +6,7 @@ files: - N - name: jumble_pon input: references/jumble_reference/{design}.reference.RDS - output: result/jumble.PoN.RDS + output: result/jumble.{design}.PoN.RDS types: - N - name: gatk_pon From 9c5d57016e38db16fb08e27a389c5b53d394e317 Mon Sep 17 00:00:00 2001 From: jonca79 <54137490+jonca79@users.noreply.github.com> Date: Fri, 18 Oct 2024 13:20:02 +0200 Subject: [PATCH 06/14] fix: design name as input to rule all --- workflow/rules/common_references.smk | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/workflow/rules/common_references.smk b/workflow/rules/common_references.smk index 509ec16e..7b757665 100644 --- a/workflow/rules/common_references.smk +++ b/workflow/rules/common_references.smk @@ -54,12 +54,11 @@ def compile_output_list(wildcards): for filedef in output_spec["files"]: output_files += set( [ - filedef["output"].format(sample=sample, type=unit_type, caller=caller, design=design) + filedef["output"].format(sample=sample, type=unit_type, caller=caller, design=config["reference"]["design_bed"].split("/")[-1]) for sample in get_samples(samples) for unit_type in get_unit_types(units, sample) if unit_type in set(filedef["types"]).intersection(types) for caller in config["bcbio_variation_recall_ensemble"]["callers"] - for design in config["reference"]["design_bed"].split("/")[-1] ] ) return list(set(output_files)) From 1b5e076e0e100b58a48a945c9044af102c0076d8 Mon Sep 17 00:00:00 2001 From: jonca79 <54137490+jonca79@users.noreply.github.com> Date: Fri, 18 Oct 2024 13:24:47 +0200 Subject: [PATCH 07/14] style: snakefmt --- workflow/rules/common_references.smk | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflow/rules/common_references.smk b/workflow/rules/common_references.smk index 7b757665..ecd1d2f5 100644 --- a/workflow/rules/common_references.smk +++ b/workflow/rules/common_references.smk @@ -54,7 +54,9 @@ def compile_output_list(wildcards): for filedef in output_spec["files"]: output_files += set( [ - filedef["output"].format(sample=sample, type=unit_type, caller=caller, design=config["reference"]["design_bed"].split("/")[-1]) + filedef["output"].format( + sample=sample, type=unit_type, caller=caller, design=config["reference"]["design_bed"].split("/")[-1] + ) for sample in get_samples(samples) for unit_type in get_unit_types(units, sample) if unit_type in set(filedef["types"]).intersection(types) From 68d253a359117ba7728d9c7217d6100fb621ad0a Mon Sep 17 00:00:00 2001 From: jonca79 <54137490+jonca79@users.noreply.github.com> Date: Mon, 21 Oct 2024 12:56:02 +0200 Subject: [PATCH 08/14] fix(report_fusion): fix div by zero --- workflow/scripts/report_fusions.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/workflow/scripts/report_fusions.py b/workflow/scripts/report_fusions.py index 2a560545..0dbe3c10 100644 --- a/workflow/scripts/report_fusions.py +++ b/workflow/scripts/report_fusions.py @@ -190,16 +190,16 @@ if int(Junction_read_count) < housekeeping_genes[gene2][0]: continue # Min AF for frequent FP gene fusions and housekeeping gene - if (gene1 in artefact_gene_dict and gene2 in artefact_gene_dict[gene1]): + if (gene1 in artefact_gene_dict and gene2 in artefact_gene_dict[gene1] and artefact_gene_dict[gene1][gene2][3] > 0): if int(Junction_read_count) / artefact_gene_dict[gene1][gene2][3] < artefact_gene_dict[gene1][gene2][2]: continue - if (gene2 in artefact_gene_dict and gene1 in artefact_gene_dict[gene2]): + if (gene2 in artefact_gene_dict and gene1 in artefact_gene_dict[gene2] and artefact_gene_dict[gene2][gene1][3] > 0): if int(Junction_read_count) / artefact_gene_dict[gene2][gene1][3] < artefact_gene_dict[gene2][gene1][2]: continue - if gene1 in housekeeping_genes: + if gene1 in housekeeping_genes and housekeeping_genes[gene1][3] > 0: if int(Junction_read_count) / housekeeping_genes[gene1][3] < housekeeping_genes[gene1][2]: continue - if gene2 in housekeeping_genes: + if gene2 in housekeeping_genes and housekeeping_genes[gene2][3] > 0: if int(Junction_read_count) / housekeeping_genes[gene2][3] < housekeeping_genes[gene2][2]: continue breakpoint1 = lline[7][:-2] @@ -282,16 +282,16 @@ if int(Spanning_reads_unique) < housekeeping_genes[gene2][1]: continue # Min AF for frequent FP gene fusions and housekeeping gene - if (gene1 in artefact_gene_dict and gene2 in artefact_gene_dict[gene1]): + if (gene1 in artefact_gene_dict and gene2 in artefact_gene_dict[gene1] and artefact_gene_dict[gene1][gene2][3] > 0): if int(Spanning_reads_unique) / artefact_gene_dict[gene1][gene2][3] < artefact_gene_dict[gene1][gene2][2]: continue - if (gene2 in artefact_gene_dict and gene1 in artefact_gene_dict[gene2]): + if (gene2 in artefact_gene_dict and gene1 in artefact_gene_dict[gene2] and artefact_gene_dict[gene2][gene1][3] > 0): if int(Spanning_reads_unique) / artefact_gene_dict[gene2][gene1][3] < artefact_gene_dict[gene2][gene1][2]: continue - if gene1 in housekeeping_genes: + if gene1 in housekeeping_genes and housekeeping_genes[gene1][3] > 0: if int(Spanning_reads_unique) / housekeeping_genes[gene1][3] < housekeeping_genes[gene1][2]: continue - if gene2 in housekeeping_genes: + if gene2 in housekeeping_genes and housekeeping_genes[gene2][3] > 0: if int(Spanning_reads_unique) / housekeeping_genes[gene2][3] < housekeeping_genes[gene2][2]: continue # Flag fusions annotated that are fusions with very high probability From 996891dcc486e4a578f6bc6f8777395bb175720c Mon Sep 17 00:00:00 2001 From: jonca79 <54137490+jonca79@users.noreply.github.com> Date: Tue, 22 Oct 2024 08:46:07 +0200 Subject: [PATCH 09/14] fix(cnvkit_batch): fix input variable name to match wrapper --- workflow/Snakefile_references.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/Snakefile_references.smk b/workflow/Snakefile_references.smk index cfed69cd..89df70b8 100644 --- a/workflow/Snakefile_references.smk +++ b/workflow/Snakefile_references.smk @@ -54,7 +54,7 @@ use rule cnvkit_batch from cnv_sv as cnv_sv_cnvkit_batch with: input: bam="alignment/samtools_merge_bam/{sample}_{type}.bam", bai="alignment/samtools_merge_bam/{sample}_{type}.bam.bai", - cnv_reference="references/cnvkit_build_normal_reference/cnvkit.PoN.cnn", + reference="references/cnvkit_build_normal_reference/cnvkit.PoN.cnn", use rule background_annotation from annotation as annotation_background_annotation with: From 76f2cc8dbe418adb069feae59e89f10ab5a2c588 Mon Sep 17 00:00:00 2001 From: jonca79 Date: Tue, 22 Oct 2024 14:13:25 +0200 Subject: [PATCH 10/14] fix: rm multiple sample lines in MultiQC_RNA --- config/reports/multiqc_config_dna.yaml | 10 ++++++++++ config/reports/multiqc_config_rna.yaml | 19 ++++++++++++++++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/config/reports/multiqc_config_dna.yaml b/config/reports/multiqc_config_dna.yaml index 10752422..9f8233b4 100644 --- a/config/reports/multiqc_config_dna.yaml +++ b/config/reports/multiqc_config_dna.yaml @@ -1,3 +1,13 @@ +title: "Clinical Genomics MultiQC Report" +subtitle: "Reference used: GRCh37" +intro_text: "The MultiQC DNA report summarise analysis results from GMS560 panel data that been analysed by the Twist Solid pipeline (https://github.com/genomic-medicine-sweden/Twist_Solid)." + +report_header_info: + - Contact E-mail: "igp-klinsek-bioinfo@lists.uu.se" + - Application Type: "Bioinformatic analysis of GMS560 panel for solid cancers" + +show_analysis_paths: True + #decimalPoint_format: ',' extra_fn_clean_exts: ##from this until end - '.duplication_metrics' diff --git a/config/reports/multiqc_config_rna.yaml b/config/reports/multiqc_config_rna.yaml index 3f59c443..8a4d9a9f 100644 --- a/config/reports/multiqc_config_rna.yaml +++ b/config/reports/multiqc_config_rna.yaml @@ -1,10 +1,23 @@ +title: "Clinical Genomics MultiQC Report" +subtitle: "Reference used: GRCh37" +intro_text: "The MultiQC RNA report summarise analysis results from GMS560 panel data that been analysed by the Twist Solid pipeline (https://github.com/genomic-medicine-sweden/Twist_Solid)." + +report_header_info: + - Contact E-mail: "igp-klinsek-bioinfo@lists.uu.se" + - Application Type: "Bioinformatic analysis of GMS560 panel for solid cancers" + +show_analysis_paths: True + + #decimalPoint_format: ',' extra_fn_clean_exts: ##from this until end - '.duplication_metrics' - '.HsMetrics' - '.alignment_summary_metrics' + # - type: regex + # pattern: '_fastq[12]' - type: regex - pattern: '_fastq[12]' + pattern: '_S[0-9]' #extra_fn_clean_trim: #if found in beginning or end #fn_ignore_dirs: #fn_ignore_files: @@ -33,8 +46,8 @@ table_columns_visible: "Samtools: stats": error_rate: False non-primary_alignments: False - reads_mapped: False - reads_mapped_percent: False + reads_mapped: True + reads_mapped_percent: True reads_properly_paired_percent: False reads_MQ0_percent: False raw_total_sequences: False From 024ebcf8b6f6976a0ddc886cf53c19757be80275 Mon Sep 17 00:00:00 2001 From: jonca79 Date: Tue, 22 Oct 2024 14:15:38 +0200 Subject: [PATCH 11/14] fix: rm multiple sample lines in MultiQC_RNA --- config/reports/multiqc_config_rna.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/config/reports/multiqc_config_rna.yaml b/config/reports/multiqc_config_rna.yaml index 8a4d9a9f..de8e342d 100644 --- a/config/reports/multiqc_config_rna.yaml +++ b/config/reports/multiqc_config_rna.yaml @@ -14,13 +14,13 @@ extra_fn_clean_exts: ##from this until end - '.duplication_metrics' - '.HsMetrics' - '.alignment_summary_metrics' - # - type: regex - # pattern: '_fastq[12]' - type: regex - pattern: '_S[0-9]' + pattern: '_fastq[12]' #extra_fn_clean_trim: #if found in beginning or end #fn_ignore_dirs: -#fn_ignore_files: +fn_ignore_files: + - type: regex + pattern: '_S[0-9]' use_filename_as_sample_name: - picard/hsmetrics From 0470ecbf13dc09bc72a42b543cdda562048cd404 Mon Sep 17 00:00:00 2001 From: jonca79 Date: Tue, 22 Oct 2024 15:56:50 +0200 Subject: [PATCH 12/14] fix: rm multiple sample lines in MultiQC_RNA --- config/reports/multiqc_config_rna.yaml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/config/reports/multiqc_config_rna.yaml b/config/reports/multiqc_config_rna.yaml index de8e342d..26f998cc 100644 --- a/config/reports/multiqc_config_rna.yaml +++ b/config/reports/multiqc_config_rna.yaml @@ -16,11 +16,13 @@ extra_fn_clean_exts: ##from this until end - '.alignment_summary_metrics' - type: regex pattern: '_fastq[12]' -#extra_fn_clean_trim: #if found in beginning or end -#fn_ignore_dirs: -fn_ignore_files: - type: regex pattern: '_S[0-9]' + - type: regex + pattern: '_R[12]' +#extra_fn_clean_trim: #if found in beginning or end +#fn_ignore_dirs: +#fn_ignore_files: use_filename_as_sample_name: - picard/hsmetrics From 94f445ffb7e9e46dd916370a1fa5f0117bd1506c Mon Sep 17 00:00:00 2001 From: jonca79 Date: Tue, 22 Oct 2024 16:13:42 +0200 Subject: [PATCH 13/14] fix: rm multiple sample lines in MultiQC_RNA --- config/reports/multiqc_config_rna.yaml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/config/reports/multiqc_config_rna.yaml b/config/reports/multiqc_config_rna.yaml index 26f998cc..22cca865 100644 --- a/config/reports/multiqc_config_rna.yaml +++ b/config/reports/multiqc_config_rna.yaml @@ -14,12 +14,8 @@ extra_fn_clean_exts: ##from this until end - '.duplication_metrics' - '.HsMetrics' - '.alignment_summary_metrics' - - type: regex - pattern: '_fastq[12]' - - type: regex - pattern: '_S[0-9]' - - type: regex - pattern: '_R[12]' + - type: regex_keep + pattern: '[0-9A-Z-]+' #extra_fn_clean_trim: #if found in beginning or end #fn_ignore_dirs: #fn_ignore_files: From 733ac6996c70175c8604e1ba55245b0bcc6b67fb Mon Sep 17 00:00:00 2001 From: jonca79 <54137490+jonca79@users.noreply.github.com> Date: Wed, 23 Oct 2024 11:23:15 +0200 Subject: [PATCH 14/14] fix: sample mixup in percentage --- workflow/scripts/sample_mixup_check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/scripts/sample_mixup_check.py b/workflow/scripts/sample_mixup_check.py index 46048cf3..3e2db10b 100644 --- a/workflow/scripts/sample_mixup_check.py +++ b/workflow/scripts/sample_mixup_check.py @@ -65,7 +65,7 @@ def read_vcf(vcf_filename, vcf_dict, samples): if rna_samples[rna_sample][dna_sample] > best_gt_match: best_dna_sample = dna_sample best_gt_match = rna_samples[rna_sample][dna_sample] - p_match = round(best_gt_match / 42.0, 1) + p_match = round(best_gt_match * 100 / 42.0, 1) report.write(f"{rna_sample}\t{best_dna_sample}\t{best_gt_match}\t{p_match}%\t") if p_match > match_cutoff: report.write(f"yes\n")