Merge pull request #40 from apriltuesday/EVA-3330

EVA-3330: Add labels to nextflow for SLURM migration
EBIvariation · Jun 14, 2024 · 2091345 · 2091345
2 parents a0eb79c + fe250f7
commit 2091345
Show file tree

Hide file tree

Showing 10 changed files with 99 additions and 25 deletions.
diff --git a/.github/workflows/variant_remapping.yml b/.github/workflows/variant_remapping.yml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.7]
+        python-version: [3.8]
 
     steps:
     - uses: actions/checkout@v2
@@ -29,6 +29,8 @@ jobs:
         echo "/tmp/nextflow" >> $GITHUB_PATH
         cd -
         # $CONDA is an environment variable pointing to the root of the miniconda directory
+        $CONDA/bin/conda update conda
+        $CONDA/bin/conda install -y python=${{ matrix.python-version }}
         $CONDA/bin/conda env update -q --file conda.yml --name base
         $CONDA/bin/conda run pip install -q -r requirements.txt
 

diff --git a/README.md b/README.md
@@ -64,3 +64,11 @@ Other files are created alongside the main output:
 - `<output>_nra_variants.vcf` variants that were successfully remapped but landed in a position where the reference allele changed. The output contains the original variant and the original reference allele as the alternate.
 - `<output>_unmapped.vcf` original variants that could not be successfully remapped
 - `<output>_counts.yml` YAML file containing counts associated with each round of remapping
+
+## Configuration
+
+The pipeline relies on Nextflow configuration to set memory and runtime requirements. This is not required for all users, but it is recommended particularly for HPC and cloud environments.
+
+There is an [example config](tests/resources/nextflow.config) used for tests that you can modify for your own needs. The main features are the use of labels to group processes into different categories based on their resource needs (small/medium/large), and the use of `base_memory` and `base_time` variables that some processes use to fine-tune their requirements.
+
+For more about Nextflow configuration, see the [documentation](https://www.nextflow.io/docs/latest/config.html).
diff --git a/conda.yml b/conda.yml
@@ -1,8 +1,8 @@
 name: variant-remapping
 channels:
-  - defaults
   - conda-forge
   - bioconda
+  - defaults
 dependencies:
   - bedtools
   - minimap2

diff --git a/main.nf b/main.nf
@@ -1,6 +1,5 @@
 #!/usr/bin/env nextflow
 
-
 // Enable syntax extension
 // See https://www.nextflow.io/docs/latest/dsl2.html
 nextflow.enable.dsl=2
@@ -46,6 +45,7 @@ outfile_dir = file(params.outfile).getParent()
  * Uncompress VCF file
  */
 process uncompressInputVCF {
+    label 'short_time', 'med_mem'
 
     input:
         path "source.vcf"
@@ -69,6 +69,7 @@ process uncompressInputVCF {
  * Filter VCF file to remove variants too close to the edges of a chromosome, because we can't get flanking regions for them
  */
 process filterInputVCF {
+    label 'default_time', 'med_mem'
 
     input:
         path "source.vcf"
@@ -94,6 +95,7 @@ process filterInputVCF {
  * Store the original VCF header for later use
  */
 process storeVCFHeader {
+    label 'short_time', 'small_mem'
 
     input:
         path "source.vcf"
@@ -114,6 +116,7 @@ include { process_split_reads; process_split_reads_mid; process_split_reads_long
  * This process converts the original header to the remapped header and concatenates it with the remapped VCF records
  */
 process generateRemappedVCF {
+    label 'short_time', 'small_mem'
 
     input:
         path "vcf_header.txt"
@@ -148,6 +151,7 @@ process generateRemappedVCF {
  * This process adds the original header to the unmapped variant VCF records and outputs the results
  */
 process generateUnmappedVCF {
+    label 'short_time', 'small_mem'
 
     publishDir outfile_dir,
         overwrite: true,
@@ -170,6 +174,7 @@ process generateUnmappedVCF {
  * Sort VCF file
  */
 process sortVCF {
+    label 'default_time', 'med_mem'
 
     input:
         path "variants_remapped.vcf"
@@ -187,6 +192,7 @@ process sortVCF {
  * Run bcftools norm to swap the REF and ALT alleles if the REF doesn't match the new assembly
  */
 process normalise {
+    label 'default_time', 'med_mem'
 
     input:
         path "variants_remapped_sorted.vcf.gz"
@@ -202,6 +208,7 @@ process normalise {
 
 
 process collectNovelReferenceAlleles {
+    label 'short_time', 'small_mem'
 
     publishDir outfile_dir,
         overwrite: true,
@@ -224,6 +231,7 @@ process collectNovelReferenceAlleles {
  * Create file containing remapping stats
  */
 process outputStats {
+    label 'short_time', 'small_mem'
 
     publishDir outfile_dir,
         overwrite: true,
@@ -244,6 +252,8 @@ process outputStats {
  * Concatenate the unmapped variants
  */
 process combineUnmappedVCF {
+    label 'short_time', 'small_mem'
+
     input:
         path "variants1.vcf"
         path "variants2.vcf"
@@ -258,6 +268,8 @@ process combineUnmappedVCF {
 
 
 process combineVCF {
+    label 'short_time', 'small_mem'
+
     input:
         path "variants1.vcf"
         path "variants2.vcf"
@@ -271,6 +283,8 @@ process combineVCF {
 }
 
 process combineYaml {
+    label 'short_time', 'small_mem'
+
     input:
         path "initial_yml"
         path "round1.yml"

diff --git a/prepare_genome.nf b/prepare_genome.nf
@@ -9,8 +9,11 @@ nextflow.enable.dsl=2
  * Index the new reference genome using bowtie_build
  */
 process bowtieGenomeIndex {
+    label 'med_time'
+
     // Memory required is 10 times the size of the fasta in Bytes or at least 1GB
-    memory Math.max(file(params.newgenome).size() * 10, 1073741824) + ' B'
+    // Overwrite base_memory so that the standard retry strategy is used
+    ext base_memory: { Math.max(file(params.newgenome).size() * 10, 1073741824) } 
 
     input:
         path "genome_fasta"
@@ -25,6 +28,7 @@ process bowtieGenomeIndex {
 
 
 process samtoolsFaidx {
+    label 'med_time', 'med_mem'
 
     input:
         path "genome_basename"
@@ -41,6 +45,7 @@ process samtoolsFaidx {
  * Extract chromosome/contig sizes
  */
 process chromSizes {
+    label 'short_time', 'small_mem'
 
     input:
         path "genome.fa.fai"

diff --git a/tests/resources/config.yml b/tests/resources/config.yml
diff --git a/tests/resources/nextflow.config b/tests/resources/nextflow.config
@@ -0,0 +1,46 @@
+
+executor {
+    name = 'local'
+}
+
+process.ext.base_memory = 6.GB
+process.ext.base_time = 10.minutes
+
+process {
+    executor = 'local'
+
+    // Dynamic resource allocation with retries
+    errorStrategy = 'retry'
+    maxRetries = 1
+    memory = { task.ext.base_memory * task.attempt }
+    time = { task.ext.base_time * task.attempt }
+
+    // Labels for specific runtimes
+    withLabel: short_time {
+        ext.base_time = 5.minutes
+    }
+    withLabel: default_time {
+        ext.base_time = 10.minutes
+    }
+    withLabel: med_time {
+        ext.base_time = 30.minutes
+    }
+    withLabel: long_time {
+        ext.base_time = 1.hour
+    }
+
+    // Labels for specific memory usage
+    withLabel: small_mem {
+        ext.base_memory = 1.GB
+    }
+    withLabel: default_mem {
+        ext.base_memory = 6.GB
+    }
+    withLabel: med_mem {
+        ext.base_memory = 8.GB
+    }
+    withLabel: big_mem {
+        ext.base_memory = 10.GB
+    }
+
+}
diff --git a/tests/test_pipeline.sh b/tests/test_pipeline.sh
@@ -33,7 +33,7 @@ chr1	3710	.	T	A	50	PASS	.	GT:GQ	1/1:0
 EOT
 
 nextflow run ${SOURCE_DIR}/main.nf \
--config ${SCRIPT_DIR}/resources/config.yml \
+-config ${SCRIPT_DIR}/resources/nextflow.config \
 --oldgenome ${SCRIPT_DIR}/resources/genome.fa \
 --newgenome ${SCRIPT_DIR}/resources/new_genome.fa \
 --vcffile ${SCRIPT_DIR}/resources/source.vcf \

diff --git a/tests/test_pipeline_empty.sh b/tests/test_pipeline_empty.sh
@@ -20,11 +20,11 @@ cat << EOT > "${SCRIPT_DIR}/resources/source_empty.vcf"
 ##INFO=<ID=COMMENT,Number=1,Type=String,Description="Comment">
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Consensus Genotype across all datasets with called genotype">
 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
-#CHROM	POS	ID	REF	 ALT	QUAL 	FILTER	INFO	FORMAT	HG001
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	HG001
 EOT
 
 nextflow run ${SOURCE_DIR}/main.nf \
--config ${SCRIPT_DIR}/resources/config.yml \
+-config ${SCRIPT_DIR}/resources/nextflow.config \
 --oldgenome ${SCRIPT_DIR}/resources/genome.fa \
 --newgenome ${SCRIPT_DIR}/resources/new_genome.fa \
 --vcffile ${SCRIPT_DIR}/resources/source_empty.vcf \
@@ -52,6 +52,7 @@ rm -rf work .nextflow* \
        ${SCRIPT_DIR}/resources/source_empty.vcf \
        ${SCRIPT_DIR}/resources/expected_remap.vcf \
        ${SCRIPT_DIR}/resources/remap_empty.vcf \
+       ${SCRIPT_DIR}/resources/remap_empty_nra_variants.vcf \
        ${SCRIPT_DIR}/resources/remap_empty_counts.yml \
        ${SCRIPT_DIR}/resources/remap_empty_unmapped.vcf \
        ${SCRIPT_DIR}/resources/new_genome.fa.* \

diff --git a/variant_to_realignment.nf b/variant_to_realignment.nf
@@ -1,6 +1,5 @@
 #!/usr/bin/env nextflow
 
-
 // Enable syntax extension
 // See https://www.nextflow.io/docs/latest/dsl2.html
 nextflow.enable.dsl=2
@@ -11,6 +10,7 @@ nextflow.enable.dsl=2
  * "strand" column.
  */
 process convertVCFToBed {
+    label 'default_time', 'med_mem'
 
     input:
         path "source.vcf"
@@ -38,6 +38,7 @@ process convertVCFToBed {
  * Based on variants BED, generate the BED file for each flank.
  */
 process flankingRegionBed {
+    label 'default_time', 'med_mem'
 
     input:
         path "variants.bed"
@@ -67,8 +68,7 @@ process flankingRegionBed {
  * Extract the actual flanking region in fasta format.
  */
 process flankingRegionFasta {
-
-    memory '4 GB'
+    label 'default_time', 'med_mem'
 
     input:  
         path "flanking_r1.bed"
@@ -91,8 +91,7 @@ process flankingRegionFasta {
  * Extract information about the original variants and put it in the fasta header
  */
 process extractVariantInfoToFastaHeader {
-
-    memory '6GB'
+    label 'default_time', 'med_mem'
 
     input:  
         path "flanking_r1.bed"
@@ -127,6 +126,7 @@ process extractVariantInfoToFastaHeader {
  * Split fasta entries into multiple chunks
  */
 process split_fasta {
+    label 'short_time', 'small_mem'
 
     input:
         path interleaved_fasta
@@ -150,13 +150,11 @@ process split_fasta {
  * Align sequence with minimap2
  */
 process alignWithMinimap {
+    label 'med_time'
 
-    // Memory required is 5 times the size of the fasta in Bytes or at least 1GB
-    // Retry on kill (exit status 130) with twice the amount of memory
-    memory { Math.max(file(params.newgenome).size() * 10, 2000000000) * task.attempt + ' B' }
-
-    errorStrategy { task.exitStatus == 130 ? 'retry' : 'terminate' }
-    maxRetries 3
+    // Memory required is 10 times the size of the fasta in Bytes or at least 2GB
+    // Overwrite base_memory so that the standard retry strategy is used
+    ext base_memory: { Math.max(file(params.newgenome).size() * 10, 2000000000) }
 
     input:
         // reads contains paired interleaved (first and second read in the same file)
@@ -168,7 +166,6 @@ process alignWithMinimap {
     output:
         path "reads_aligned.bam", emit: reads_aligned_bam
 
-
     script:
     if (flanklength < 500)
         """
@@ -199,6 +196,7 @@ process alignWithMinimap {
  * Sort BAM file by name
  */
 process sortByName {
+    label 'default_time', 'med_mem'
 
     input:
         path "reads_aligned.bam"
@@ -215,9 +213,11 @@ process sortByName {
  * Align sequence with bowtie2
  */
 process alignWithBowtie {
+    label 'med_time'
 
     // Memory required is 5 times the size of the fasta in Bytes or at least 1GB
-    memory Math.max(file(params.newgenome).size() * 5, 1073741824) + ' B'
+    // Overwrite base_memory so that the standard retry strategy is used
+    ext base_memory: { Math.max(file(params.newgenome).size() * 5, 1073741824) }
 
     input:
         path "variant_read1.fa"
@@ -242,6 +242,7 @@ process alignWithBowtie {
  * Take the reads and process them to get the remapped variants
  */
 process readsToRemappedVariants {
+    label 'default_time', 'med_mem'
 
     input:
         path "reads.bam"
@@ -276,6 +277,8 @@ process readsToRemappedVariants {
  *
  */
 process merge_variants {
+    label 'short_time', 'small_mem'
+
     input:
         path "remapped*.vcf"
         path "unmapped*.vcf"