From aa06413befbc812bae78e0787a79128350853245 Mon Sep 17 00:00:00 2001
From: jonca79
Date: Tue, 5 Sep 2023 10:48:27 +0200
Subject: [PATCH 1/3] feat: trim reads to 100bp for improved results in Arriba

---
Review note (ignored by git am): this patch was edited by hand after export.
prealignment_fastp_pe_arriba now overrides "container:" so that the
fastp_pe_arriba.container key added to config/config.yaml is actually used,
and short explanatory comments were added. Hunk sizes and offsets were
recomputed; the blob ids on the "index" lines of the edited files are those
of the original export. Regenerate with git format-patch after applying.

 config/config.yaml        |  5 +++
 config/resources.yaml     |  5 +++
 workflow/Snakefile        | 65 +++++++++++++++++++++++++++++++++++++++
 workflow/rules/common.smk |  9 +++++
 4 files changed, 84 insertions(+)

diff --git a/config/config.yaml b/config/config.yaml
index e6031856..fb8c076f 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -131,6 +131,11 @@ fastp_pe:
   # Default enabled trimming parameters for fastp. Specified for clarity.
   extra: "--trim_poly_g --qualified_quality_phred 15 --unqualified_percent_limit 40 --n_base_limit 5 --length_required 15"
 
+fastp_pe_arriba:
+  container: "docker://hydragenetics/fastp:0.20.1"
+  # Trim reads to a maximum length of 100 bp; read by the Arriba-specific fastp rule.
+  extra: "--max_len1 100"
+
 fastqc:
   container: "docker://hydragenetics/fastqc:0.11.9"
 
diff --git a/config/resources.yaml b/config/resources.yaml
index 6c36ab88..1a709299 100644
--- a/config/resources.yaml
+++ b/config/resources.yaml
@@ -22,6 +22,11 @@ fastp_pe:
   mem_mb: 30720
   mem_per_cpu: 6144
 
+fastp_pe_arriba:
+  threads: 5
+  mem_mb: 30720
+  mem_per_cpu: 6144
+
 fuseq_wes:
   threads: 2
   mem_mb: 12288
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 067b4829..b4387ca3 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -60,6 +60,64 @@ module prealignment:
 use rule * from prealignment as prealignment_*
 
 
+# Arriba branch: fastp_pe with separate outputs; extra, container and resources come from "fastp_pe_arriba".
+use rule fastp_pe from prealignment as prealignment_fastp_pe_arriba with:
+    output:
+        trimmed=temp(
+            [
+                "prealignment/fastp_pe_arriba/{sample}_{type}_{flowcell}_{lane}_{barcode}_fastq1.fastq.gz",
+                "prealignment/fastp_pe_arriba/{sample}_{type}_{flowcell}_{lane}_{barcode}_fastq2.fastq.gz",
+            ]
+        ),
+        html="prealignment/fastp_pe_arriba/{sample}_{type}_{flowcell}_{lane}_{barcode}_fastp.html",
+        json="prealignment/fastp_pe_arriba/{sample}_{type}_{flowcell}_{lane}_{barcode}_fastp.json",
+    params:
+        adapters=lambda wildcards: " --adapter_sequence {} --adapter_sequence_r2 {} ".format(
+            *get_fastq_adapter(units, wildcards).split(",")
+        ),
+        extra=config.get("fastp_pe_arriba", {}).get("extra", ""),
+    log:
+        "prealignment/fastp_pe_arriba/{sample}_{type}_{flowcell}_{lane}_{barcode}_fastq.fastq.gz.log",
+    benchmark:
+        repeat(
+            "prealignment/fastp_pe_arriba/{sample}_{type}_{flowcell}_{lane}_{barcode}_fastq.fastq.gz.benchmark.tsv",
+            config.get("fastp_pe_arriba", {}).get("benchmark_repeats", 1),
+        )
+    resources:
+        mem_mb=config.get("fastp_pe_arriba", {}).get("mem_mb", config["default_resources"]["mem_mb"]),
+        mem_per_cpu=config.get("fastp_pe_arriba", {}).get("mem_per_cpu", config["default_resources"]["mem_per_cpu"]),
+        partition=config.get("fastp_pe_arriba", {}).get("partition", config["default_resources"]["partition"]),
+        threads=config.get("fastp_pe_arriba", {}).get("threads", config["default_resources"]["threads"]),
+        time=config.get("fastp_pe_arriba", {}).get("time", config["default_resources"]["time"]),
+    threads: config.get("fastp_pe_arriba", {}).get("threads", config["default_resources"]["threads"])
+    container:
+        config.get("fastp_pe_arriba", {}).get("container", config["default_container"])
+
+
+# Arriba branch: the prealignment merged rule applied to the fastp_pe_arriba files listed by merged_input_arriba.
+use rule merged from prealignment as prealignment_merged_arriba with:
+    input:
+        fastq=merged_input_arriba,
+    output:
+        fastq=temp("prealignment/merged_arriba/{sample}_{type}_{read}.fastq.gz"),
+    log:
+        "prealignment/merged_arriba/{sample}_{type}_{read}.fastq.gz.log",
+    benchmark:
+        repeat(
+            "prealignment/merged_arriba/{sample}_{type}_{read}.fastq.gz.benchmark.tsv",
+            config.get("merged_arriba", {}).get("benchmark_repeats", 1),
+        )
+    resources:
+        mem_mb=config.get("merged_arriba", {}).get("mem_mb", config["default_resources"]["mem_mb"]),
+        mem_per_cpu=config.get("merged_arriba", {}).get("mem_per_cpu", config["default_resources"]["mem_per_cpu"]),
+        partition=config.get("merged_arriba", {}).get("partition", config["default_resources"]["partition"]),
+        threads=config.get("merged_arriba", {}).get("threads", config["default_resources"]["threads"]),
+        time=config.get("merged_arriba", {}).get("time", config["default_resources"]["time"]),
+    threads: config.get("merged_arriba", {}).get("threads", config["default_resources"]["threads"])
+    container:
+        config.get("merged_arriba", {}).get("container", config["default_container"])
+
+
 module alignment:
     snakefile:
         get_module_snakefile(config, "hydra-genetics/alignment", path="workflow/Snakefile", tag="v0.3.1")
@@ -294,6 +352,13 @@ use rule star_fusion from fusions as fusions_star_fusion with:
         sj=temp("fusions/star_fusion/{sample}_{type}/SJ.out.tab"),
 
 
+use rule star from fusions as fusions_star with:
+    input:
+        fq1="prealignment/merged_arriba/{sample}_{type}_fastq1.fastq.gz",
+        fq2="prealignment/merged_arriba/{sample}_{type}_fastq2.fastq.gz",
+        idx=config.get("star", {}).get("genome_index", ""),
+
+
 module cnv_sv:
     snakefile:
         get_module_snakefile(config, "hydra-genetics/cnv_sv", path="workflow/Snakefile", tag="v0.3.1")
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 15d1573e..9c81665d 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -70,6 +70,15 @@ wildcard_constraints:
     type="N|T|R",
 
 
+# Input function for prealignment_merged_arriba: one fastp_pe_arriba fastq path per unit returned by get_units.
+merged_input_arriba = lambda wildcards: expand(
+    "prealignment/fastp_pe_arriba/{{sample}}_{{type}}_{flowcell_lane_barcode}_{{read}}.fastq.gz",
+    flowcell_lane_barcode=[
+        "{}_{}_{}".format(unit.flowcell, unit.lane, unit.barcode) for unit in get_units(units, wildcards, wildcards.type)
+    ],
+)
+
+
 def compile_output_list(wildcards):
     output_files = []
     types = set([unit.type for unit in units.itertuples()])

From 67f1bf6dd693078d47de9745759e485bcd502239 Mon Sep 17 00:00:00 2001
From: jonca79
Date: Tue, 5 Sep 2023 10:49:04 +0200
Subject: [PATCH 2/3] fix: update snakemake version to avoid checkpoint restart
 job bug

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index b8b3f0be..7b9c5aaf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 hydra-genetics==0.15.0
 pandas>=1.3.1
-snakemake==7.13.0
+snakemake==7.18.0
 singularity==3.0.0
 jinja2==3.0.1
 networkx

From e2949715dbbb33ae8b4d077a34bddc5b1f7f6196 Mon Sep 17 00:00:00 2001
From: jonca79
Date: Tue, 5 Sep 2023 11:09:38 +0200
Subject: [PATCH 3/3] fix: merge two rule definitions into one

---
Review note (ignored by git am): hunk offsets were shifted by four lines to
follow the hand edit of patch 1/3; the blob ids on the "index" line are those
of the original export.

 workflow/Snakefile | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index b4387ca3..d3fc0384 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -129,6 +129,10 @@ use rule * from alignment as alignment_*
 
 
 use rule star from alignment as alignment_star with:
+    input:
+        fq1="prealignment/merged_arriba/{sample}_{type}_fastq1.fastq.gz",
+        fq2="prealignment/merged_arriba/{sample}_{type}_fastq2.fastq.gz",
+        idx=config.get("star", {}).get("genome_index", ""),
     params:
         extra=lambda wildcards: "%s %s"
         % (
@@ -352,13 +356,6 @@ use rule star_fusion from fusions as fusions_star_fusion with:
         sj=temp("fusions/star_fusion/{sample}_{type}/SJ.out.tab"),
 
 
-use rule star from fusions as fusions_star with:
-    input:
-        fq1="prealignment/merged_arriba/{sample}_{type}_fastq1.fastq.gz",
-        fq2="prealignment/merged_arriba/{sample}_{type}_fastq2.fastq.gz",
-        idx=config.get("star", {}).get("genome_index", ""),
-
-
 module cnv_sv:
     snakefile:
         get_module_snakefile(config, "hydra-genetics/cnv_sv", path="workflow/Snakefile", tag="v0.3.1")