
Merge pull request #13 from PathoGenOmics-Lab/feature-slurm
Add basic SLURM compatibility and logging
ahmig authored Aug 7, 2023
2 parents e8bec03 + 359001c commit b805c03
Showing 35 changed files with 232 additions and 55 deletions.
66 changes: 49 additions & 17 deletions README.md
@@ -2,33 +2,65 @@

![Install workflow](https://github.com/PathoGenOmics-Lab/Case-study-SARS-CoV-2/actions/workflows/install.yml/badge.svg)
![Test workflow](https://github.com/PathoGenOmics-Lab/Case-study-SARS-CoV-2/actions/workflows/test.yml/badge.svg)
[![Snakemake](https://img.shields.io/badge/snakemake-≥7.19-brightgreen.svg?style=flat)](https://snakemake.readthedocs.io)

## Instructions

To run the pipeline, first fetch the data (you may need to modify the script to include your credentials):
To run the pipeline, you will need an environment with `snakemake`
(check [the Snakemake docs](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html)).

Then, modify the [target configuration file](config/targets.yaml)
to point to your data. It should look like this:

```yaml
SAMPLES:
  sample1:
    bam: "path/to/sorted/bam1.bam"
    fasta: "path/to/sequence1.fasta"
  sample2:
    bam: "path/to/sorted/bam2.bam"
    fasta: "path/to/sequence2.fasta"
  ...
METADATA:
  "path/to/metadata.csv"
OUTPUT_DIRECTORY:
  "output"
CONTEXT_FASTA:
  null
```
You may also provide this information through the `--config` parameter.
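
The workflow reads this file into Snakemake's global `config` dictionary, so every entry becomes a plain dictionary lookup. As a rough sketch of that access pattern (the helpers `iter_samples` and `get_input_bam`/`get_input_fasta` are referenced by the rules further down this diff, but their real definitions are not shown here, so the bodies below are only illustrative):

```python
# Illustrative only: Snakemake populates `config` from config/targets.yaml
# (via `configfile:` or `--configfile`); these helper bodies are assumptions,
# not copied from the repository.
def iter_samples():
    """Yield every sample name declared under SAMPLES."""
    yield from config["SAMPLES"]

def get_input_bam(wildcards):
    """Return the sorted BAM path configured for the current sample."""
    return config["SAMPLES"][wildcards.sample]["bam"]

def get_input_fasta(wildcards):
    """Return the consensus FASTA path configured for the current sample."""
    return config["SAMPLES"][wildcards.sample]["fasta"]
```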

Setting `CONTEXT_FASTA` to `null` will enable automatic download of sequences
from the GISAID SARS-CoV-2 database
(see [the following section](README.md#context-checkpoints) for further details).
To enable this, you must provide your GISAID credentials by creating and
filling an additional configuration file `config/gisaid.yaml` as follows:

```bash
./fetch_data.sh
```yaml
USERNAME: "your-username"
PASSWORD: "your-password"
```

Then, within an environment with `snakemake>6.0`
(see [the Snakemake docs](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html)),
run the following:
To run the analysis with the default configuration, just run the following command
(change the `-c/--cores` argument to use a different number of CPUs):

```bash
```shell
snakemake --use-conda -c8
```

You may change the `-c/--cores` argument to use a different number of CPUs.
To run the analysis in an HPC environment using SLURM, we provide a
[default profile configuration](profile/default/config.yaml) that you can adapt
to your needs or use directly by running the following command:

## Context checkpoints
```shell
snakemake --slurm --use-conda --profile profile/default
```

By default, a dataset of samples matching the location and time window
of the target samples will be fetched from the GISAID database
## Context checkpoints

By default, the pipeline starts by selecting samples that meet the spatial
and temporal criteria (see the [`download_context`](workflow/rules/context.smk)
rule for reference):
By default, a dataset of samples that meet the following spatial and temporal criteria,
set through the [`download_context` rule](workflow/rules/context.smk), is retrieved from the GISAID database:

- Location matching the place(s) of sampling of the target samples
- Collection date within the time window that includes 95% of the date distribution of the
@@ -43,15 +75,15 @@ If these requirements are not met, a custom sequence dataset must be
provided through the `CONTEXT_FASTA` parameter by editing [the target configuration file](config/targets.yaml)
or via the command line:

```bash
```shell
snakemake --config CONTEXT_FASTA="path/to/fasta"
```

## Workflow graphs

To generate a simplified rule graph, run:

```bash
```shell
snakemake --rulegraph | dot -Tpng > .rulegraph.png
```

@@ -60,7 +92,7 @@ snakemake --rulegraph | dot -Tpng > .rulegraph.png
To generate the directed acyclic graph (DAG) of all rules
to be executed, run:

```bash
```shell
snakemake --forceall --dag | dot -Tpng > .dag.png
```

2 changes: 2 additions & 0 deletions config/config.yaml
@@ -24,3 +24,5 @@ GISAID_YAML:
"config/gisaid.yaml"
DIVERSITY_BOOTSTRAP_REPS:
1000
LOG_PY_FMT:
"%(asctime)s - %(name)s - %(levelname)-8s - %(message)s"
15 changes: 0 additions & 15 deletions fetch_data.sh

This file was deleted.

10 changes: 10 additions & 0 deletions profile/default/config.yaml
@@ -0,0 +1,10 @@
default-resources:
- slurm_account="pgo"
- slurm_partition="global"
- mem="4000MB"
- runtime="30m"
- slurm_extra="--qos=short"
jobs: 25
shadow-prefix: "/scr/aherrera/"
rerun-incomplete: True
restart-times: 5
5 changes: 5 additions & 0 deletions workflow/Snakefile
@@ -37,6 +37,11 @@ if "CONTEXT_FASTA" not in config.keys() or config["CONTEXT_FASTA"] is None:
else:
    CONTEXT_FASTA = config["CONTEXT_FASTA"]


## Logging
LOGDIR = OUTDIR / "logs"


## report
REPORT_DIR = Path(OUTDIR/"report")
REPORT_DIR.mkdir(parents=True, exist_ok=True)
11 changes: 10 additions & 1 deletion workflow/rules/asr.smk
@@ -10,13 +10,15 @@ rule reconstruct_ancestral_sequence:
    output:
        folder = directory(OUTDIR/"tree"),
        state_file = OUTDIR/"tree"/f"{OUTPUT_NAME}.state"
    log:
        LOGDIR / "reconstruct_ancestral_sequence" / "log.txt"
    shell:
        """
        mkdir -p {output.folder}
        iqtree2 \
            {params.etc} -asr \
            -o {config[ALIGNMENT_REFERENCE]} -T AUTO --threads-max {threads} -s {input.fasta} \
            --seqtype {params.seqtype} -m {config[TREE_MODEL]} --prefix {output.folder}/{params.name}
            --seqtype {params.seqtype} -m {config[TREE_MODEL]} --prefix {output.folder}/{params.name} >{log} 2>&1
        """


@@ -31,6 +33,8 @@ rule ancestor_fasta:
        state_file = OUTDIR/"tree"/f"{OUTPUT_NAME}.state"
    output:
        fasta = report(OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta")
    log:
        LOGDIR / "ancestor_fasta" / "log.txt"
    script:
        "../scripts/ancestor_fasta.py"

@@ -51,8 +55,13 @@ rule ml_context_tree:
        folder = directory(OUTDIR/"tree_context"),
        state_file = OUTDIR/"tree_context"/f"{OUTPUT_NAME}.state",
        ml = OUTDIR/f"tree_context/{OUTPUT_NAME}.treefile"
    log:
        LOGDIR / "ml_context_tree" / "log.txt"
    shell:
        """
        exec >{log}
        exec 2>&1
        awk '/^>/{{p=seen[$0]++}}!p' {input.fasta} {input.outgroup_aln} > aln.fasta
        mkdir -p {output.folder}
        iqtree2 \
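
Shell rules above capture their output with `>{log} 2>&1`, whereas `script:` rules such as `ancestor_fasta` only declare a log path. A minimal sketch of how such a Python script could route its messages there, assuming it uses the `snakemake` object injected at run time and the `LOG_PY_FMT` entry added to `config/config.yaml` (the actual contents of `ancestor_fasta.py` are not shown in this diff):

```python
# Hypothetical logging setup inside a Snakemake `script:` file (e.g. ancestor_fasta.py).
# The `snakemake` object is injected by Snakemake; this sketch is not the actual script.
import logging

logging.basicConfig(
    filename=snakemake.log[0],              # path declared in the rule's `log:` directive
    format=snakemake.config["LOG_PY_FMT"],  # format string from config/config.yaml
    level=logging.INFO,
)
log = logging.getLogger("ancestor_fasta")
log.info("Reading IQ-TREE state file: %s", snakemake.input.state_file)
```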
8 changes: 7 additions & 1 deletion workflow/rules/context.smk
@@ -25,6 +25,8 @@ rule download_context:
        fasta = temp(OUTDIR/"context"/"sequences.fasta"),
        metadata = temp(OUTDIR/"context"/"metadata.csv"),
        duplicate_accids = OUTDIR/"context"/"duplicate_accession_ids.txt"
    log:
        LOGDIR / "download_context" / "log.txt"
    script:
        "../scripts/download_context.R"

@@ -41,8 +43,10 @@ rule align_context:
    output:
        folder = directory(OUTDIR/"context"/"nextalign"),
        fasta = OUTDIR/"context"/"nextalign"/"context_sequences.aligned.fasta"
    log:
        LOGDIR / "align_context" / "log.txt"
    shell:
        "nextalign run -j {threads} -O {output.folder} -o {output.fasta} -n {params.name} --include-reference -r {input.ref_fasta} {input.fasta}"
        "nextalign run -j {threads} -O {output.folder} -o {output.fasta} -n {params.name} --include-reference -r {input.ref_fasta} {input.fasta} >{log} 2>&1"


rule mask_context:
@@ -57,5 +61,7 @@ rule mask_context:
        ref_fasta = OUTDIR/"reference.fasta"
    output:
        fasta = OUTDIR/"context"/"nextalign"/"context_sequences.aligned.masked.fasta"
    log:
        LOGDIR / "mask_context" / "log.txt"
    script:
        "../scripts/mask-aln.py"
14 changes: 10 additions & 4 deletions workflow/rules/demix.smk
@@ -1,7 +1,7 @@
rule demix_preprocessing:
    threads: 1
    conda: "../envs/freyja.yaml"
    shadow: "full"
    shadow: "minimal"
    input:
        bam = get_input_bam,
        ref_fasta = OUTDIR/"mapping_references.fasta"
@@ -10,21 +10,23 @@ rule demix_preprocessing:
    output:
        depth_file = OUTDIR/"demixing"/"{sample}/{sample}_depth.txt",
        variants_file = OUTDIR/"demixing"/"{sample}/{sample}_variants.tsv"
    log:
        LOGDIR / "demix_preprocessing" / "{sample}.log.txt"
    shell:
        """
        freyja variants \
            "{input.bam}" \
            --variants {output.variants_file} \
            --depths {output.depth_file} \
            --minq {params.minq} \
            --ref {input.ref_fasta}
            --ref {input.ref_fasta} >{log} 2>&1
        """


rule demix:
    threads: 1
    conda: "../envs/freyja.yaml"
    shadow: "full"
    shadow: "minimal"
    input:
        depth_file = OUTDIR/"demixing"/"{sample}/{sample}_depth.txt",
        variants_file = OUTDIR/"demixing"/"{sample}/{sample}_variants.tsv"
@@ -33,14 +35,16 @@ rule demix:
        minimum_abundance = config["DEMIX"]["MIN_ABUNDANCE"]
    output:
        demix_file = OUTDIR/"demixing"/"{sample}/{sample}_demixed.tsv"
    log:
        LOGDIR / "demix" / "{sample}.log.txt"
    shell:
        """
        freyja demix \
            "{input.variants_file}" \
            "{input.depth_file}" \
            --eps {params.minimum_abundance} \
            --covcut {params.coverage_cutoff} \
            --output {output.demix_file}
            --output {output.demix_file} >{log} 2>&1
        """


@@ -52,5 +56,7 @@ rule summarise_demixing:
        tables = expand(OUTDIR/"demixing"/"{sample}/{sample}_demixed.tsv", sample=iter_samples())
    output:
        summary_df = report(OUTDIR/"summary_freyja_demixing.csv")
    log:
        LOGDIR / "summarise_demixing" / "log.txt"
    script:
        "../scripts/summary_demixing.R"
2 changes: 2 additions & 0 deletions workflow/rules/distances.smk
@@ -9,5 +9,7 @@ rule weighted_distances:
        tsv = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv"
    output:
        distances = OUTDIR/f"{OUTPUT_NAME}.weighted_distances.csv"
    log:
        LOGDIR / "weighted_distances" / "log.txt"
    script:
        "../scripts/weighted_distances.py"
2 changes: 2 additions & 0 deletions workflow/rules/evolution.smk
@@ -7,5 +7,7 @@ rule N_S_sites:
        fasta = OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta"
    output:
        csv = OUTDIR/f"{OUTPUT_NAME}.ancestor.N_S.sites.csv"
    log:
        LOGDIR / "N_S_sites" / "log.txt"
    script:
        "../scripts/N_S_sites_from_fasta.py"
14 changes: 11 additions & 3 deletions workflow/rules/fasta.smk
@@ -3,8 +3,10 @@ rule rename_fastas:
        fasta = get_input_fasta
    output:
        renamed = temp(OUTDIR/"renamed.{sample}.fasta")
    log:
        LOGDIR / "rename_fastas" / "{sample}.log.txt"
    shell:
        "sed 's/>.*/>'{wildcards.sample}'/g' {input.fasta} > {output.renamed}"
        "sed 's/>.*/>'{wildcards.sample}'/g' {input.fasta} > {output.renamed} 2> {log}"



@@ -15,8 +17,10 @@ rule concat_fasta:
        expand(OUTDIR/"renamed.{sample}.fasta", sample = iter_samples())
    output:
        fasta = OUTDIR/f"{OUTPUT_NAME}.fasta"
    log:
        LOGDIR / "concat_fasta" / "log.txt"
    shell:
        "cat {input} > {output.fasta}"
        "cat {input} > {output.fasta} 2> {log}"


rule align_fasta:
@@ -31,8 +35,10 @@ rule align_fasta:
    output:
        folder = directory(OUTDIR/"nextalign"),
        fasta = OUTDIR/"nextalign"/f"{OUTPUT_NAME}.aligned.fasta"
    log:
        LOGDIR / "align_fasta" / "log.txt"
    shell:
        "nextalign run -j {threads} -O {output.folder} -o {output.fasta} -n {params.name} --include-reference -r {input.ref_fasta} {input.fasta}"
        "nextalign run -j {threads} -O {output.folder} -o {output.fasta} -n {params.name} --include-reference -r {input.ref_fasta} {input.fasta} >{log} 2>&1"


rule mask_alignment:
@@ -47,5 +53,7 @@ rule mask_alignment:
        ref_fasta = OUTDIR/"reference.fasta"
    output:
        fasta = OUTDIR/"nextalign"/f"{OUTPUT_NAME}.aligned.masked.fasta"
    log:
        LOGDIR / "mask_alignment" / "log.txt"
    script:
        "../scripts/mask-aln.py"
12 changes: 9 additions & 3 deletions workflow/rules/fetch.smk
@@ -3,22 +3,26 @@ rule fetch_alignment_reference:
    conda: "../envs/fetch.yaml"
    output:
        fasta = OUTDIR/"reference.fasta"
    log:
        LOGDIR / "fetch_alignment_reference" / "log.txt"
    shell:
        "esearch -db nucleotide -query {config[ALIGNMENT_REFERENCE]} | efetch -format fasta > {output.fasta}"
        "esearch -db nucleotide -query {config[ALIGNMENT_REFERENCE]} | efetch -format fasta > {output.fasta} 2> {log}"


rule fetch_mapping_references:
    threads: 1
    conda: "../envs/fetch.yaml"
    output:
        fasta = OUTDIR/"mapping_references.fasta"
    log:
        LOGDIR / "fetch_mapping_references" / "log.txt"
    shell:
        """
        # Create empty file
        echo -n > {output.fasta}
        # Download and concatenate each reference
        echo {config[MAPPING_REFERENCES]} | while read -r ref_id; do
            esearch -db nucleotide -query "$ref_id" | efetch -format fasta >> {output.fasta}
            esearch -db nucleotide -query "$ref_id" | efetch -format fasta >> {output.fasta} 2>> {log}
        done
        """

@@ -27,5 +31,7 @@ rule fetch_alignment_annotation:
    threads: 1
    output:
        OUTDIR/"reference.gff3"
    log:
        LOGDIR / "fetch_alignment_annotation" / "log.txt"
    shell:
        "wget -O {output} 'https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id={config[ALIGNMENT_REFERENCE]}'"
        "wget -O {output} 'https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id={config[ALIGNMENT_REFERENCE]}' 2>{log}"
4 changes: 3 additions & 1 deletion workflow/rules/pangolin.smk
@@ -5,7 +5,9 @@ rule pangolin_report:
        fastas = OUTDIR/f"{OUTPUT_NAME}.fasta"
    output:
        report = report(OUTDIR/f"{OUTPUT_NAME}.lineage_report.csv")
    log:
        LOGDIR / "pangolin_report" / "log.txt"
    shell:
        """
        pangolin {input.fastas} --outfile {output.report} -t {threads}
        pangolin {input.fastas} --outfile {output.report} -t {threads} >{log} 2>&1
        """