# Patch summary: subworkflows and processes stop reading `params.*` directly.
# main.nf now passes evodists, extraexons, alignmentnum, prevaln, orthopairs,
# orthogroupnum and long/medium/short_dist as explicit inputs, and the
# evodists/extraexons, prevaln and orthopairs channels are built inside
# PREPARE, ALIGN and CLUSTER respectively.
# Review fix (prepare.nf): `evodists_ch` is now assigned BEFORE CHECK_INPUT(...)
# consumes it; it was previously assigned only after that call, leaving the
# name undefined at the point of use. Hunk headers were adjusted to match
# (+17,14 and +37,20).
# NOTE(review): the prepare.nf `index` post-image hash predates this fix;
# regenerate the patch from a working tree if exact blob hashes matter.
diff --git a/main.nf b/main.nf index 6fba958..2f57584 100644 --- a/main.nf +++ b/main.nf @@ -164,25 +164,10 @@ workflow { .fromFilePairs(params.annotations, size: 1) .ifEmpty { error "Cannot find any annotation matching: ${params.annotations}" } - extraexons = params.extraexons ? - Channel.fromFilePairs(params.extraexons, checkIfExists: true, size: 1) - .ifEmpty { error "Extra exons not found" } : - Channel.empty() - - // We join channels. If no extraexons, then it's empty, so no problem - data_to_annotation_raw = genomes.join(annotations) - data_to_annotation = data_to_annotation_raw.join(extraexons, remainder: true) - - evodists_ch = Channel.fromPath(params.evodists, checkIfExists: true).collect() clusterfile_ch = Channel.fromPath(params.cluster, checkIfExists: true).collect() - if ( params.orthopairs ) { - orthopairs_ch = Channel.fromPath(params.orthopairs, checkIfExists: true).collect() - } else { - orthopairs_ch = Channel.fromPath("/path/to/NO_FILE").collect() - } PREPARE( - evodists_ch, + params.evodists, clusterfile_ch, gtfs, fastas, @@ -191,8 +176,10 @@ workflow { params.long_dist, params.medium_dist, params.short_dist, - data_to_annotation, - params.extraexons + genomes, + annotations, + params.extraexons, + params.alignmentnum ) ALIGN( @@ -202,10 +189,27 @@ workflow { params.long_dist, params.medium_dist, params.short_dist, + params.alignmentnum, + params.prevaln + ) + + SCORE( + ALIGN.out.folder_jscores, + PREPARE.out.clusters_split_ch, + PREPARE.out.dist_ranges_ch, + params.bonafide_pairs, + params.long_dist, + params.medium_dist, + params.short_dist ) - SCORE(ALIGN.out.folder_jscores, PREPARE.out.clusters_split_ch, PREPARE.out.dist_ranges_ch, params.bonafide_pairs) - CLUSTER(SCORE.out.score_exon_hits_pairs, PREPARE.out.clusters_split_ch, clusterfile_ch, orthopairs_ch) + CLUSTER( + SCORE.out.score_exon_hits_pairs, + PREPARE.out.clusters_split_ch, + clusterfile_ch, + params.orthopairs, + params.orthogroupnum + ) } } diff --git 
a/modules/local/exorthist/align_pairs.nf b/modules/local/exorthist/align_pairs.nf index 007a2a4..8b39525 100644 --- a/modules/local/exorthist/align_pairs.nf +++ b/modules/local/exorthist/align_pairs.nf @@ -8,13 +8,14 @@ process PARSE_IPA_PROT_ALN { val long_dist val medium_dist val short_dist + path prevaln output: tuple val("${sp1.name}-${sp2.name}"), path("${sp1.name}-${sp2.name}-*"), emit: aligned_subclusters_4_splitting path "${sp1.name}-${sp2.name}_EXs_to_split_part_*.txt", emit: EXs_to_split script: - def prev_alignments = params.prevaln ? params.prevaln : "" + def prev_alignments = prevaln.name != 'NO_FILE' ? "${prevaln}" : '' def cls_parts = cls_part_file.name.split("_") def dist_range_par diff --git a/modules/local/exorthist/filter_matches.nf b/modules/local/exorthist/filter_matches.nf index e6340ce..00dfc1e 100644 --- a/modules/local/exorthist/filter_matches.nf +++ b/modules/local/exorthist/filter_matches.nf @@ -7,6 +7,9 @@ process FILTER_AND_SELECT_BEST_EX_MATCHES_BY_TARGETGENE { input: tuple val(comp_id), path(all_scores), val(dist_range) + val(long_dist) + val(medium_dist) + val(short_dist) output: path "*.tab", emit: filterscore_per_joining @@ -18,13 +21,13 @@ process FILTER_AND_SELECT_BEST_EX_MATCHES_BY_TARGETGENE { switch(dist_range) { case "long": - dist_range_par = params.long_dist.split(",") + dist_range_par = long_dist.split(",") break case "medium": - dist_range_par = params.medium_dist.split(",") + dist_range_par = medium_dist.split(",") break case "short": - dist_range_par = params.short_dist.split(",") + dist_range_par = short_dist.split(",") break } diff --git a/modules/local/exorthist/format_input.nf b/modules/local/exorthist/format_input.nf index 07ce575..90f11c0 100644 --- a/modules/local/exorthist/format_input.nf +++ b/modules/local/exorthist/format_input.nf @@ -2,6 +2,7 @@ process FORMAT_EX_CLUSTERS_INPUT { input: path score_exon_hits_pairs path clusterfile + val(orthogroupnum) output: path "PART_*-cluster_input.tab", emit: 
cluster_parts @@ -10,10 +11,10 @@ process FORMAT_EX_CLUSTERS_INPUT { """ if [[ "${clusterfile}" == *.gz ]]; then zcat ${clusterfile} > cluster_file - D1_format_EX_clusters_input.pl cluster_file ${score_exon_hits_pairs} ${params.orthogroupnum} + D1_format_EX_clusters_input.pl cluster_file ${score_exon_hits_pairs} ${orthogroupnum} rm cluster_file else - D1_format_EX_clusters_input.pl ${clusterfile} ${score_exon_hits_pairs} ${params.orthogroupnum} + D1_format_EX_clusters_input.pl ${clusterfile} ${score_exon_hits_pairs} ${orthogroupnum} fi """ } diff --git a/modules/local/exorthist/split_clusters_chunks.nf b/modules/local/exorthist/split_clusters_chunks.nf index bacc8aa..7e612ca 100644 --- a/modules/local/exorthist/split_clusters_chunks.nf +++ b/modules/local/exorthist/split_clusters_chunks.nf @@ -4,6 +4,7 @@ process SPLIT_CLUSTERS_IN_CHUNKS { input: path cls_tab_files tuple val(id_comb), path(idfolder_A), path(idfolder_B) + val(alignmentnum) output: tuple path(idfolder_A), path(idfolder_B), path("${idfolder_A}_${idfolder_B}/*.cls.tab-part_*"), emit: cls_files_2_align @@ -15,7 +16,7 @@ process SPLIT_CLUSTERS_IN_CHUNKS { --sp2 ${idfolder_B} \ --expath ./ \ --project_dir ./ \ - --N_split ${params.alignmentnum} \ + --N_split ${alignmentnum} \ --gene_cluster ${id_comb}.cls.tab """ } diff --git a/modules/local/exorthist/split_pairs.nf b/modules/local/exorthist/split_pairs.nf index 396ae58..6027857 100644 --- a/modules/local/exorthist/split_pairs.nf +++ b/modules/local/exorthist/split_pairs.nf @@ -2,6 +2,7 @@ process SPLIT_EX_PAIRS_TO_REALIGN { label 'pandas' input: path '*' + val(alignmentnum) output: path '*EXs_to_realign_part_*', emit: EXs_to_realign_batches @@ -9,7 +10,7 @@ process SPLIT_EX_PAIRS_TO_REALIGN { script: """ for file in \$(ls *); do - B2_split_EX_pairs_to_realign.py -i \${file} -n ${params.alignmentnum} + B2_split_EX_pairs_to_realign.py -i \${file} -n ${alignmentnum} done """ } diff --git a/subworkflows/local/exorthist/align.nf 
b/subworkflows/local/exorthist/align.nf index da1f067..08361d7 100644 --- a/subworkflows/local/exorthist/align.nf +++ b/subworkflows/local/exorthist/align.nf @@ -14,17 +14,25 @@ workflow ALIGN { long_dist medium_dist short_dist + alignmentnum + prevaln main: + if (prevaln) { + prevaln_ch = Channel.fromPath(prevaln, type: 'dir', checkIfExists: true).collect() + } else { + prevaln_ch = Channel.fromPath("/path/to/NO_FILE").collect() + } + // the last argument is the protein similarity alignment. // if a prevaln folder is provided, the protein alignments present in each species pair subfolder will not be repeated. - PARSE_IPA_PROT_ALN(blosumfile, alignment_input, long_dist, medium_dist, short_dist) + PARSE_IPA_PROT_ALN(blosumfile, alignment_input, long_dist, medium_dist, short_dist, prevaln_ch) // Collapse EXs_to_split in batches of 500 files EXs_to_split = PARSE_IPA_PROT_ALN.out.EXs_to_split EXs_to_split_batches = EXs_to_split.toSortedList().flatten().buffer(size : 500, remainder: true) // Split exons pairs to realign - SPLIT_EX_PAIRS_TO_REALIGN(EXs_to_split_batches) + SPLIT_EX_PAIRS_TO_REALIGN(EXs_to_split_batches, alignmentnum) EXs_to_realign_batches = SPLIT_EX_PAIRS_TO_REALIGN.out.EXs_to_realign_batches // Flatten the results from the previous batch run and combine with sp1 and sp2 information, using sp1-sp2 as key. 
EXs_to_realign = EXs_to_realign_batches.flatten().map{[it.getName().toString().split("_")[0],it]}.groupTuple().join(clusters_split_ch).transpose() diff --git a/subworkflows/local/exorthist/cluster.nf b/subworkflows/local/exorthist/cluster.nf index 7158edf..2f3019f 100644 --- a/subworkflows/local/exorthist/cluster.nf +++ b/subworkflows/local/exorthist/cluster.nf @@ -12,11 +12,17 @@ workflow CLUSTER { score_exon_hits_pairs clusters_split_ch clusterfile_ch - orthopairs_ch + orthopairs + orthogroupnum main: + if (orthopairs) { + orthopairs_ch = Channel.fromPath(orthopairs, checkIfExists: true).collect() + } else { + orthopairs_ch = Channel.fromPath("/path/to/NO_FILE").collect() + } - FORMAT_EX_CLUSTERS_INPUT(score_exon_hits_pairs, clusterfile_ch) + FORMAT_EX_CLUSTERS_INPUT(score_exon_hits_pairs, clusterfile_ch, orthogroupnum) // Split the file of exon pairs // Unclustered are the exons ending up in single-exon clusters diff --git a/subworkflows/local/exorthist/prepare.nf b/subworkflows/local/exorthist/prepare.nf index 5c1e202..625ed15 100644 --- a/subworkflows/local/exorthist/prepare.nf +++ b/subworkflows/local/exorthist/prepare.nf @@ -8,7 +8,7 @@ include { SPLIT_CLUSTERS_BY_SPECIES_PAIRS } from "${LOCAL_MODULES}/split_cluster workflow PREPARE { take: - evodists_ch + evodists clusterfile_ch gtfs fastas @@ -17,24 +17,14 @@ workflow PREPARE { long_dist medium_dist short_dist - data_to_annotation + genomes + annotations extraexons + alignmentnum main: - // Print contents of each channel - // gtfs.view { "GTF file: $it" } - // fastas.view { "FASTA file: $it" } - // gtfs_suffix.view { "GTF suffix: $it" } - // fastas_suffix.view { "FASTA suffix: $it" } - // data_to_annotation.view { "Data to annotation: $it" } - - extraexons_ch = params.extraexons ? 
- Channel.fromFilePairs(params.extraexons, checkIfExists: true, size: 1) - .ifEmpty { error "Extra exons not found" } : - Channel.empty() - - + evodists_ch = Channel.fromPath(evodists, checkIfExists: true).collect() CHECK_INPUT( evodists_ch, clusterfile_ch, @@ -47,11 +37,20 @@ workflow PREPARE { short_dist ) - // Sic: https://nextflow-io.github.io/patterns/optional-input/ - if ( extraexons ) { - GENERATE_ANNOTATIONS(data_to_annotation, extraexons_ch) + extraexons_ch = extraexons ? + Channel.fromFilePairs(extraexons, checkIfExists: true, size: 1) + .ifEmpty { error "Extra exons not found" } : + Channel.empty() + + // We join channels. If no extraexons, then it's empty, so no problem + data_to_annotation_raw = genomes.join(annotations) + data_to_annotation = data_to_annotation_raw.join(extraexons_ch, remainder: true) + + if (extraexons) { + GENERATE_ANNOTATIONS(data_to_annotation, extraexons_ch) } else { - GENERATE_ANNOTATIONS(data_to_annotation, Channel.fromPath("/path/to/NO_FILE").collect()) + // Sic: https://nextflow-io.github.io/patterns/optional-input/ + GENERATE_ANNOTATIONS(data_to_annotation, Channel.fromPath("/path/to/NO_FILE").collect()) } clusters_split_ch = GENERATE_ANNOTATIONS.out.idfolders.toList().map{ [it, it].combinations().findAll{ a, b -> a[0] < b[0]} } @@ -63,7 +62,7 @@ workflow PREPARE { // Split clusters cls_tab_files_ch = SPLIT_CLUSTERS_BY_SPECIES_PAIRS.out.cls_tab_files - SPLIT_CLUSTERS_IN_CHUNKS(cls_tab_files_ch.collect(), clusters_split_ch) + SPLIT_CLUSTERS_IN_CHUNKS(cls_tab_files_ch.collect(), clusters_split_ch, alignmentnum) cls_files_2_align = SPLIT_CLUSTERS_IN_CHUNKS.out.cls_files_2_align cls_files_2_align_t = cls_files_2_align.transpose().map{[it[0].getFileName().toString()+"-"+it[1].getFileName().toString(), it[0], it[1], it[2]]} diff --git a/subworkflows/local/exorthist/score.nf b/subworkflows/local/exorthist/score.nf index 74bd639..df15f9b 100644 --- a/subworkflows/local/exorthist/score.nf +++ b/subworkflows/local/exorthist/score.nf 
@@ -12,6 +12,9 @@ workflow SCORE { clusters_split_ch dist_ranges_ch bonafide_pairs + long_dist + medium_dist + short_dist main: @@ -20,7 +23,7 @@ workflow SCORE { SCORE_EX_MATCHES(data_to_score) // Filter the best matches above score cutoffs by target gene. all_scores_to_filt_ch = SCORE_EX_MATCHES.out.all_scores_to_filt - FILTER_AND_SELECT_BEST_EX_MATCHES_BY_TARGETGENE(all_scores_to_filt_ch.join(dist_ranges_ch)) + FILTER_AND_SELECT_BEST_EX_MATCHES_BY_TARGETGENE(all_scores_to_filt_ch.join(dist_ranges_ch), long_dist, medium_dist, short_dist) // Join filtered scored EX matches filterscore_per_joining_ch = FILTER_AND_SELECT_BEST_EX_MATCHES_BY_TARGETGENE.out.filterscore_per_joining JOIN_FILTERED_EX_MATCHES(filterscore_per_joining_ch.collect())