Skip to content

Commit

Permalink
Merge pull request #40 from apriltuesday/EVA-3330
Browse files Browse the repository at this point in the history
EVA-3330: Add labels to nextflow for SLURM migration
  • Loading branch information
apriltuesday authored Jun 14, 2024
2 parents a0eb79c + fe250f7 commit 2091345
Show file tree
Hide file tree
Showing 10 changed files with 99 additions and 25 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/variant_remapping.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.7]
python-version: [3.8]

steps:
- uses: actions/checkout@v2
Expand All @@ -29,6 +29,8 @@ jobs:
echo "/tmp/nextflow" >> $GITHUB_PATH
cd -
# $CONDA is an environment variable pointing to the root of the miniconda directory
$CONDA/bin/conda update conda
$CONDA/bin/conda install -y python=${{ matrix.python-version }}
$CONDA/bin/conda env update -q --file conda.yml --name base
$CONDA/bin/conda run pip install -q -r requirements.txt
Expand Down
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,11 @@ Other files are created alongside the main output:
- `<output>_nra_variants.vcf` variants successfully remap that landed in a position where the reference allele changed. The output contains the original variant and the original reference allele as alternate.
- `<output>_unmapped.vcf` original variant that could not be successfully remap
- `<output>_count.yml` YAML file containing counts associated with each round of remapping

## Configuration

The pipeline relies on Nextflow configuration to set memory and runtime requirements. This is not required for all users, but it is recommended particularly for HPC and cloud environments.

There is an [example config](tests/resources/nextflow.config) used for tests that you can modify for your own needs. The main features are the use of labels to group processes into different categories based on their resource needs (small/medium/large), and the use of `base_memory` and `base_time` variables that some processes use to fine-tune their requirements.

For more about Nextflow configuration, see the [documentation](https://www.nextflow.io/docs/latest/config.html).
2 changes: 1 addition & 1 deletion conda.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
name: variant-remapping
channels:
- defaults
- conda-forge
- bioconda
- defaults
dependencies:
- bedtools
- minimap2
Expand Down
16 changes: 15 additions & 1 deletion main.nf
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env nextflow


// Enable syntax extension
// See https://www.nextflow.io/docs/latest/dsl2.html
nextflow.enable.dsl=2
Expand Down Expand Up @@ -46,6 +45,7 @@ outfile_dir = file(params.outfile).getParent()
* Uncompress VCF file
*/
process uncompressInputVCF {
label 'short_time', 'med_mem'

input:
path "source.vcf"
Expand All @@ -69,6 +69,7 @@ process uncompressInputVCF {
* filter VCF file to remove variant too close the edges of chromosome because we can't get flanking regions
*/
process filterInputVCF {
label 'default_time', 'med_mem'

input:
path "source.vcf"
Expand All @@ -94,6 +95,7 @@ process filterInputVCF {
* Store the original VCF header for later use
*/
process storeVCFHeader {
label 'short_time', 'small_mem'

input:
path "source.vcf"
Expand All @@ -114,6 +116,7 @@ include { process_split_reads; process_split_reads_mid; process_split_reads_long
* This process convert the original Header to the remapped header and concatenate it with the remapped VCF records
*/
process generateRemappedVCF {
label 'short_time', 'small_mem'

input:
path "vcf_header.txt"
Expand Down Expand Up @@ -148,6 +151,7 @@ process generateRemappedVCF {
* This process adds the original header to unmapped variant VCF records and output the results
*/
process generateUnmappedVCF {
label 'short_time', 'small_mem'

publishDir outfile_dir,
overwrite: true,
Expand All @@ -170,6 +174,7 @@ process generateUnmappedVCF {
* Sort VCF file
*/
process sortVCF {
label 'default_time', 'med_mem'

input:
path "variants_remapped.vcf"
Expand All @@ -187,6 +192,7 @@ process sortVCF {
* Run bcftools norm to swap the REF and ALT alleles if the REF doesn't match the new assembly
*/
process normalise {
label 'default_time', 'med_mem'

input:
path "variants_remapped_sorted.vcf.gz"
Expand All @@ -202,6 +208,7 @@ process normalise {


process collectNovelReferenceAlleles {
label 'short_time', 'small_mem'

publishDir outfile_dir,
overwrite: true,
Expand All @@ -224,6 +231,7 @@ process collectNovelReferenceAlleles {
* Create file containing remapping stats
*/
process outputStats {
label 'short_time', 'small_mem'

publishDir outfile_dir,
overwrite: true,
Expand All @@ -244,6 +252,8 @@ process outputStats {
* Concatenate the unmapped variants
*/
process combineUnmappedVCF {
label 'short_time', 'small_mem'

input:
path "variants1.vcf"
path "variants2.vcf"
Expand All @@ -258,6 +268,8 @@ process combineUnmappedVCF {


process combineVCF {
label 'short_time', 'small_mem'

input:
path "variants1.vcf"
path "variants2.vcf"
Expand All @@ -271,6 +283,8 @@ process combineVCF {
}

process combineYaml {
label 'short_time', 'small_mem'

input:
path "initial_yml"
path "round1.yml"
Expand Down
7 changes: 6 additions & 1 deletion prepare_genome.nf
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,11 @@ nextflow.enable.dsl=2
* Index the new reference genome using bowtie_build
*/
process bowtieGenomeIndex {
label 'med_time'

// Memory required is 10 times the size of the fasta in Bytes or at least 1GB
memory Math.max(file(params.newgenome).size() * 10, 1073741824) + ' B'
// Overwrite base_memory so that the standard retry strategy is used
ext base_memory: { Math.max(file(params.newgenome).size() * 10, 1073741824) }

input:
path "genome_fasta"
Expand All @@ -25,6 +28,7 @@ process bowtieGenomeIndex {


process samtoolsFaidx {
label 'med_time', 'med_mem'

input:
path "genome_basename"
Expand All @@ -41,6 +45,7 @@ process samtoolsFaidx {
* Extract chomosome/contig sizes
*/
process chromSizes {
label 'short_time', 'small_mem'

input:
path "genome.fa.fai"
Expand Down
5 changes: 0 additions & 5 deletions tests/resources/config.yml

This file was deleted.

46 changes: 46 additions & 0 deletions tests/resources/nextflow.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@

executor {
name = 'local'
}

process.ext.base_memory = 6.GB
process.ext.base_time = 10.minutes

process {
executor = 'local'

// Dynamic resource allocation with retries
errorStrategy = 'retry'
maxRetries = 1
memory = { task.ext.base_memory * task.attempt }
time = { task.ext.base_time * task.attempt }

// Labels for specific runtimes
withLabel: short_time {
ext.base_time = 5.minutes
}
withLabel: default_time {
ext.base_time = 10.minutes
}
withLabel: med_time {
ext.base_time = 30.minutes
}
withLabel: long_time {
ext.base_time = 1.hour
}

// Labels for specific memory usage
withLabel: small_mem {
ext.base_memory = 1.GB
}
withLabel: default_mem {
ext.base_memory = 6.GB
}
withLabel: med_mem {
ext.base_memory = 8.GB
}
withLabel: big_mem {
ext.base_memory = 10.GB
}

}
2 changes: 1 addition & 1 deletion tests/test_pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ chr1 3710 . T A 50 PASS . GT:GQ 1/1:0
EOT

nextflow run ${SOURCE_DIR}/main.nf \
-config ${SCRIPT_DIR}/resources/config.yml \
-config ${SCRIPT_DIR}/resources/nextflow.config \
--oldgenome ${SCRIPT_DIR}/resources/genome.fa \
--newgenome ${SCRIPT_DIR}/resources/new_genome.fa \
--vcffile ${SCRIPT_DIR}/resources/source.vcf \
Expand Down
5 changes: 3 additions & 2 deletions tests/test_pipeline_empty.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@ cat << EOT > "${SCRIPT_DIR}/resources/source_empty.vcf"
##INFO=<ID=COMMENT,Number=1,Type=String,Description="Comment">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Consensus Genotype across all datasets with called genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG001
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG001
EOT

nextflow run ${SOURCE_DIR}/main.nf \
-config ${SCRIPT_DIR}/resources/config.yml \
-config ${SCRIPT_DIR}/resources/nextflow.config \
--oldgenome ${SCRIPT_DIR}/resources/genome.fa \
--newgenome ${SCRIPT_DIR}/resources/new_genome.fa \
--vcffile ${SCRIPT_DIR}/resources/source_empty.vcf \
Expand Down Expand Up @@ -52,6 +52,7 @@ rm -rf work .nextflow* \
${SCRIPT_DIR}/resources/source_empty.vcf \
${SCRIPT_DIR}/resources/expected_remap.vcf \
${SCRIPT_DIR}/resources/remap_empty.vcf \
${SCRIPT_DIR}/resources/remap_empty_nra_variants.vcf \
${SCRIPT_DIR}/resources/remap_empty_counts.yml \
${SCRIPT_DIR}/resources/remap_empty_unmapped.vcf \
${SCRIPT_DIR}/resources/new_genome.fa.* \
Expand Down
29 changes: 16 additions & 13 deletions variant_to_realignment.nf
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env nextflow


// Enable syntax extension
// See https://www.nextflow.io/docs/latest/dsl2.html
nextflow.enable.dsl=2
Expand All @@ -11,6 +10,7 @@ nextflow.enable.dsl=2
* "strand" column.
*/
process convertVCFToBed {
label 'default_time', 'med_mem'

input:
path "source.vcf"
Expand Down Expand Up @@ -38,6 +38,7 @@ process convertVCFToBed {
* Based on variants BED, generate the BED file for each flank.
*/
process flankingRegionBed {
label 'default_time', 'med_mem'

input:
path "variants.bed"
Expand Down Expand Up @@ -67,8 +68,7 @@ process flankingRegionBed {
* Extract the actual flanking region in fasta format.
*/
process flankingRegionFasta {

memory '4 GB'
label 'default_time', 'med_mem'

input:
path "flanking_r1.bed"
Expand All @@ -91,8 +91,7 @@ process flankingRegionFasta {
* Extract information about the original variants and put it in the fasta header
*/
process extractVariantInfoToFastaHeader {

memory '6GB'
label 'default_time', 'med_mem'

input:
path "flanking_r1.bed"
Expand Down Expand Up @@ -127,6 +126,7 @@ process extractVariantInfoToFastaHeader {
* Split fasta entries into multiple chunks
*/
process split_fasta {
label 'short_time', 'small_mem'

input:
path interleaved_fasta
Expand All @@ -150,13 +150,11 @@ process split_fasta {
* Align sequence with minimap2
*/
process alignWithMinimap {
label 'med_time'

// Memory required is 5 times the size of the fasta in Bytes or at least 1GB
// Retry on kill (exit status 130) with twice the amount of memory
memory { Math.max(file(params.newgenome).size() * 10, 2000000000) * task.attempt + ' B' }

errorStrategy { task.exitStatus == 130 ? 'retry' : 'terminate' }
maxRetries 3
// Memory required is 10 times the size of the fasta in Bytes or at least 2GB
// Overwrite base_memory so that the standard retry strategy is used
ext base_memory: { Math.max(file(params.newgenome).size() * 10, 2000000000) }

input:
// reads contains paired interleaved (first and second read in the same file)
Expand All @@ -168,7 +166,6 @@ process alignWithMinimap {
output:
path "reads_aligned.bam", emit: reads_aligned_bam


script:
if (flanklength < 500)
"""
Expand Down Expand Up @@ -199,6 +196,7 @@ process alignWithMinimap {
* Sort BAM file by name
*/
process sortByName {
label 'default_time', 'med_mem'

input:
path "reads_aligned.bam"
Expand All @@ -215,9 +213,11 @@ process sortByName {
* Align sequence with bowtie2
*/
process alignWithBowtie {
label 'med_time'

// Memory required is 5 times the size of the fasta in Bytes or at least 1GB
memory Math.max(file(params.newgenome).size() * 5, 1073741824) + ' B'
// Overwrite base_memory so that the standard retry strategy is used
ext base_memory: { Math.max(file(params.newgenome).size() * 5, 1073741824) }

input:
path "variant_read1.fa"
Expand All @@ -242,6 +242,7 @@ process alignWithBowtie {
* Take the reads and process them to get the remapped variants
*/
process readsToRemappedVariants {
label 'default_time', 'med_mem'

input:
path "reads.bam"
Expand Down Expand Up @@ -276,6 +277,8 @@ process readsToRemappedVariants {
*
*/
process merge_variants {
label 'short_time', 'small_mem'

input:
path "remapped*.vcf"
path "unmapped*.vcf"
Expand Down

0 comments on commit 2091345

Please sign in to comment.