diff --git a/assets/NO_FILE b/assets/NO_FILE new file mode 100644 index 00000000..e69de29b diff --git a/assets/proteinfold_template.html b/assets/proteinfold_template.html new file mode 100644 index 00000000..67affbc1 --- /dev/null +++ b/assets/proteinfold_template.html @@ -0,0 +1,944 @@ + + + + + + + Protein structure prediction + + + + + + + + + + + + + + + + + +
+ +
+ +
+ + + +
+ +
+ +
+
+
+
+
+
<50
+
70
+
90+
+
+
+
+
+

+ Alphafold produces a + + per-residue confidence score (pLDDT) + + between 0 and 100. Some regions below 50 pLDDT may be unstructured in isolation. +

+
+
+ + + + +
+ +
+ +
+ +
+
+
Information
+ +
+
+
Program: *prog_name*
+
ID: *sample_name*
+
+
Average:
+
+
+
Navigation
+ +
+
+ Scroll up/down + to zoom in and out +
+
+ Click + drag + to rotate the structure +
+
+ CTRL + click + drag + to move the structure +
+
+ Click + an atom to bring it into focus +
+
+
+
+ +
+
+
Toggle representations
+
+ + + + +
+
+
+
+
+
Actions
+
+ + +
+
+
+
Download
+
+ + +
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
Sequence Coverage
+
+
+ +
+ +
+
+
+
pLDDT
+
+ +
+
+
+
+
+ +
+ +
+
+
+ + + +
+
+

+ The Australian BioCommons + is supported by + Bioplatforms Australia +

+

+ Bioplatforms Australia + is enabled by + NCRIS +

+
+
+
+ + + diff --git a/bin/extract_output.py b/bin/extract_output.py new file mode 100755 index 00000000..3b22f8e5 --- /dev/null +++ b/bin/extract_output.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python +import pickle +import os +import argparse + + +def read_pkl(id, pkl_files): + for pkl_file in pkl_files: + dict_data = pickle.load(open(pkl_file, "rb")) + if pkl_file.endswith("features.pkl"): + with open(f"{id}_msa.tsv", "w") as out_f: + for val in dict_data["msa"]: + out_f.write("\t".join([str(x) for x in val]) + "\n") + else: + model_id = ( + os.path.basename(pkl_file) + .replace("result_model_", "") + .replace("_pred_0.pkl", "") + ) + with open(f"{id}_lddt_{model_id}.tsv", "w") as out_f: + out_f.write("\t".join([str(x) for x in dict_data["plddt"]]) + "\n") + + +parser = argparse.ArgumentParser() +parser.add_argument("--pkls", dest="pkls", required=True, nargs="+") +parser.add_argument("--name", dest="name") +parser.add_argument("--output_dir", dest="output_dir") +parser.set_defaults(output_dir="") +parser.set_defaults(name="") +args = parser.parse_args() + +read_pkl(args.name, args.pkls) diff --git a/bin/generate_report.py b/bin/generate_report.py new file mode 100755 index 00000000..dd980ba4 --- /dev/null +++ b/bin/generate_report.py @@ -0,0 +1,368 @@ +#!/usr/bin/env python + +import os +import argparse +from matplotlib import pyplot as plt +from collections import OrderedDict +import base64 +import plotly.graph_objects as go +import re +from Bio import PDB + + +def generate_output_images(msa_path, plddt_data, name, out_dir, in_type, generate_tsv): + msa = [] + if in_type.lower() != "colabfold" and not msa_path.endswith("NO_FILE"): + with open(msa_path, "r") as in_file: + for line in in_file: + msa.append([int(x) for x in line.strip().split()]) + + seqid = [] + for sequence in msa: + matches = [ + 1.0 if first == other else 0.0 for first, other in zip(msa[0], sequence) + ] + seqid.append(sum(matches) / len(matches)) + + seqid_sort = sorted(range(len(seqid)), key=seqid.__getitem__) + + non_gaps = [] + for sequence in msa: + non_gaps.append( + [float(num != 21) if num != 21 else float("nan") for num in sequence] + ) + + sorted_non_gaps = [non_gaps[i] for i in seqid_sort] + final = [] + for sorted_seq, identity in zip( + sorted_non_gaps, [seqid[i] for i in seqid_sort] + ): + final.append( + [ + value * identity if not isinstance(value, str) else value + for value in sorted_seq + ] + ) + + # ################################################################## + plt.figure(figsize=(14, 14), dpi=100) + # ################################################################## + plt.title("Sequence coverage", fontsize=30, pad=36) + plt.imshow( + final, + interpolation="nearest", + aspect="auto", + cmap="rainbow_r", + vmin=0, + vmax=1, + origin="lower", + ) + + column_counts = [0] * len(msa[0]) + for col in range(len(msa[0])): + for row in msa: + if row[col] != 21: + column_counts[col] += 1 + + plt.plot(column_counts, color="black") + plt.xlim(-0.5, len(msa[0]) - 0.5) + plt.ylim(-0.5, len(msa) - 0.5) + + plt.tick_params(axis="both", which="both", labelsize=18) + + cbar = plt.colorbar() + cbar.set_label("Sequence identity to query", fontsize=24, labelpad=24) + cbar.ax.tick_params(labelsize=18) + plt.xlabel("Positions", fontsize=24, labelpad=24) + plt.ylabel("Sequences", fontsize=24, labelpad=36) + plt.savefig(f"{out_dir}/{name+('_' if name else '')}seq_coverage.png") + + # ################################################################## + + plddt_per_model = OrderedDict() + output_data = plddt_data + + if generate_tsv == "y": + for plddt_path in output_data: + with open(plddt_path, "r") as in_file: + plddt_per_model[os.path.basename(plddt_path)[:-4]] = [ + float(x) for x in in_file.read().strip().split() + ] + else: + for i, plddt_values_str in enumerate(output_data): + plddt_per_model[i] = [] + plddt_per_model[i] = [float(x) for x in plddt_values_str.strip().split()] + + fig = go.Figure() + for idx, (model_name, value_plddt) in enumerate(plddt_per_model.items()): + rank_label = f"Ranked {idx}" + fig.add_trace( + go.Scatter( + x=list(range(len(value_plddt))), + y=value_plddt, + mode="lines", + name=rank_label, + text=[f"({i}, {value:.2f})" for i, value in enumerate(value_plddt)], + hoverinfo="text", + ) + ) + fig.update_layout( + title=dict(text="Predicted LDDT per position", x=0.5, xanchor="center"), + xaxis=dict( + title="Positions", showline=True, linecolor="black", gridcolor="WhiteSmoke" + ), + yaxis=dict( + title="Predicted LDDT", + range=[0, 100], + minallowed=0, + maxallowed=100, + showline=True, + linecolor="black", + gridcolor="WhiteSmoke", + ), + legend=dict(yanchor="bottom", y=0, xanchor="right", x=1.3), + plot_bgcolor="white", + width=600, + height=600, + modebar_remove=["toImage", "zoomIn", "zoomOut"], + ) + html_content = fig.to_html( + full_html=False, + include_plotlyjs="cdn", + config={"displayModeBar": True, "displaylogo": False, "scrollZoom": True}, + ) + + with open( + f"{out_dir}/{name+('_' if name else '')}coverage_LDDT.html", "w" + ) as out_file: + out_file.write(html_content) + + +def generate_plots(msa_path, plddt_paths, name, out_dir): + msa = [] + with open(msa_path, "r") as in_file: + for line in in_file: + msa.append([int(x) for x in line.strip().split()]) + + seqid = [] + for sequence in msa: + matches = [ + 1.0 if first == other else 0.0 for first, other in zip(msa[0], sequence) + ] + seqid.append(sum(matches) / len(matches)) + + seqid_sort = sorted(range(len(seqid)), key=seqid.__getitem__) + + non_gaps = [] + for sequence in msa: + non_gaps.append( + [float(num != 21) if num != 21 else float("nan") for num in sequence] + ) + + sorted_non_gaps = [non_gaps[i] for i in seqid_sort] + final = [] + for sorted_seq, identity in zip(sorted_non_gaps, [seqid[i] for i in seqid_sort]): + final.append( + [ + value * identity if not isinstance(value, str) else value + for value in sorted_seq + ] + ) + + # Plotting Sequence Coverage using Plotly + fig = go.Figure() + fig.add_trace( + go.Heatmap( + z=final, + colorscale="Rainbow", + zmin=0, + zmax=1, + ) + ) + fig.update_layout( + title="Sequence coverage", xaxis_title="Positions", yaxis_title="Sequences" + ) + # Save as interactive HTML instead of an image + fig.savefig(f"{out_dir}/{name+('_' if name else '')}seq_coverage.png") + + # Plotting Predicted LDDT per position using Plotly + plddt_per_model = OrderedDict() + plddt_paths.sort() + for plddt_path in plddt_paths: + with open(plddt_path, "r") as in_file: + plddt_per_model[os.path.basename(plddt_path)[:-4]] = [ + float(x) for x in in_file.read().strip().split() + ] + + i = 0 + for model_name, value_plddt in plddt_per_model.items(): + fig = go.Figure() + fig.add_trace( + go.Scatter( + x=list(range(len(value_plddt))), + y=value_plddt, + mode="lines", + name=model_name, + ) + ) + fig.update_layout(title="Predicted LDDT per Position") + fig.savefig(f"{out_dir}/{name+('_' if name else '')}coverage_LDDT_{i}.png") + i += 1 + + +def align_structures(structures): + parser = PDB.PDBParser(QUIET=True) + structures = [ + parser.get_structure(f"Structure_{i}", pdb) for i, pdb in enumerate(structures) + ] + + ref_structure = structures[0] + ref_atoms = [atom for atom in ref_structure.get_atoms()] + + super_imposer = PDB.Superimposer() + aligned_structures = [structures[0]] # Include the reference structure in the list + + for i, structure in enumerate(structures[1:], start=1): + target_atoms = [atom for atom in structure.get_atoms()] + + super_imposer.set_atoms(ref_atoms, target_atoms) + super_imposer.apply(structure.get_atoms()) + + aligned_structure = f"aligned_structure_{i}.pdb" + io = PDB.PDBIO() + io.set_structure(structure) + io.save(aligned_structure) + aligned_structures.append(aligned_structure) + + return aligned_structures + + +def pdb_to_lddt(pdb_files, generate_tsv): + pdb_files_sorted = pdb_files + pdb_files_sorted.sort() + + output_lddt = [] + averages = [] + + for pdb_file in pdb_files_sorted: + plddt_values = [] + current_resd = [] + last = None + with open(pdb_file, "r") as infile: + for line in infile: + columns = line.split() + if len(columns) >= 11: + if last and last != columns[5]: + plddt_values.append(sum(current_resd) / len(current_resd)) + current_resd = [] + current_resd.append(float(columns[10])) + last = columns[5] + if len(current_resd) > 0: + plddt_values.append(sum(current_resd) / len(current_resd)) + + # Calculate the average PLDDT value for the current file + if plddt_values: + avg_plddt = sum(plddt_values) / len(plddt_values) + averages.append(round(avg_plddt, 3)) + else: + averages.append(0.0) + + if generate_tsv == "y": + output_file = f"{pdb_file.replace('.pdb', '')}_plddt.tsv" + with open(output_file, "w") as outfile: + outfile.write(" ".join(map(str, plddt_values)) + "\n") + output_lddt.append(output_file) + else: + plddt_values_string = " ".join(map(str, plddt_values)) + output_lddt.append(plddt_values_string) + + return output_lddt, averages + + +print("Starting...") + +version = "1.0.0" +parser = argparse.ArgumentParser() +parser.add_argument("--type", dest="in_type") +parser.add_argument( + "--generate_tsv", choices=["y", "n"], default="n", dest="generate_tsv" +) +parser.add_argument("--msa", dest="msa", default="NO_FILE") +parser.add_argument("--pdb", dest="pdb", required=True, nargs="+") +parser.add_argument("--name", dest="name") +parser.add_argument("--output_dir", dest="output_dir") +parser.add_argument("--html_template", dest="html_template") +parser.add_argument("--version", action="version", version=f"{version}") +parser.set_defaults(output_dir="") +parser.set_defaults(in_type="ESMFOLD") +parser.set_defaults(name="") +args = parser.parse_args() + +lddt_data, lddt_averages = pdb_to_lddt(args.pdb, args.generate_tsv) + +generate_output_images( + args.msa, lddt_data, args.name, args.output_dir, args.in_type, args.generate_tsv +) +# generate_plots(args.msa, args.plddt, args.name, args.output_dir) + +print("generating html report...") +structures = args.pdb +structures.sort() +aligned_structures = align_structures(structures) + +io = PDB.PDBIO() +ref_structure_path = "aligned_structure_0.pdb" +io.set_structure(aligned_structures[0]) +io.save(ref_structure_path) +aligned_structures[0] = ref_structure_path + +alphafold_template = open(args.html_template, "r").read() +alphafold_template = alphafold_template.replace("*sample_name*", args.name) +alphafold_template = alphafold_template.replace("*prog_name*", args.in_type) + +args_pdb_array_js = ",\n".join([f'"{model}"' for model in structures]) +alphafold_template = re.sub( + r"const MODELS = \[.*?\];", # Match the existing MODELS array in HTML template + f"const MODELS = [\n {args_pdb_array_js}\n];", # Replace with the new array + alphafold_template, + flags=re.DOTALL, +) + +averages_js_array = f"const LDDT_AVERAGES = {lddt_averages};" +alphafold_template = alphafold_template.replace( + "const LDDT_AVERAGES = [];", averages_js_array +) + +i = 0 +for structure in aligned_structures: + alphafold_template = alphafold_template.replace( + f"*_data_ranked_{i}.pdb*", open(structure, "r").read().replace("\n", "\\n") + ) + i += 1 + +if not args.msa.endswith("NO_FILE"): + image_path = ( + f"{args.output_dir}/{args.msa}" + if args.in_type.lower() == "colabfold" + else f"{args.output_dir}/{args.name + ('_' if args.name else '')}seq_coverage.png" + ) + with open(image_path, "rb") as in_file: + alphafold_template = alphafold_template.replace( + "seq_coverage.png", + f"data:image/png;base64,{base64.b64encode(in_file.read()).decode('utf-8')}", + ) +else: + pattern = r'
.*?(.*?)*?
\s*' + alphafold_template = re.sub(pattern, "", alphafold_template, flags=re.DOTALL) + +with open( + f"{args.output_dir}/{args.name + ('_' if args.name else '')}coverage_LDDT.html", + "r", +) as in_file: + lddt_html = in_file.read() + alphafold_template = alphafold_template.replace( + '
', lddt_html + ) + +with open(f"{args.output_dir}/{args.name}_{args.in_type.lower()}_report.html", "w") as out_file: + out_file.write(alphafold_template) diff --git a/main.nf b/main.nf index 6d762db1..e7e0778c 100644 --- a/main.nf +++ b/main.nf @@ -35,6 +35,7 @@ include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nf include { getColabfoldAlphafold2Params } from './subworkflows/local/utils_nfcore_proteinfold_pipeline' include { getColabfoldAlphafold2ParamsPath } from './subworkflows/local/utils_nfcore_proteinfold_pipeline' +include { GENERATE_REPORT } from './modules/local/generate_report' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ COLABFOLD PARAMETER VALUES @@ -58,7 +59,7 @@ workflow NFCORE_PROTEINFOLD { main: ch_multiqc = Channel.empty() ch_versions = Channel.empty() - + ch_report_input = Channel.empty() // // WORKFLOW: Run alphafold2 // @@ -115,6 +116,10 @@ workflow NFCORE_PROTEINFOLD { ) ch_multiqc = ALPHAFOLD2.out.multiqc_report ch_versions = ch_versions.mix(ALPHAFOLD2.out.versions) + ch_report_input = ch_report_input.mix( + ALPHAFOLD2.out.pdb.join(ALPHAFOLD2.out.msa).map{it[0]["model"] = "ALPHAFOLD2"; it} + ) + } // @@ -150,6 +155,13 @@ workflow NFCORE_PROTEINFOLD { ) ch_multiqc = COLABFOLD.out.multiqc_report ch_versions = ch_versions.mix(COLABFOLD.out.versions) + ch_report_input = ch_report_input.mix( + COLABFOLD + .out + .pdb + .join(COLABFOLD.out.msa) + .map { it[0]["model"] = "colabfold"; it } + ) } // @@ -178,6 +190,21 @@ workflow NFCORE_PROTEINFOLD { ) ch_multiqc = ESMFOLD.out.multiqc_report ch_versions = ch_versions.mix(ESMFOLD.out.versions) + ch_report_input = ch_report_input.mix( + ESMFOLD.out.pdb.combine(Channel.fromPath("$projectDir/assets/NO_FILE")).map{it[0]["model"] = "ESMFOLD"; it} + ) + } + // + // POST PROCESSING: generate visulaisation reports + // + if (!params.skip_visualisation){ + GENERATE_REPORT( + ch_report_input.map{[it[0], it[1]]}, + ch_report_input.map{[it[0], it[2]]}, + ch_report_input.map{it[0].model}, + Channel.fromPath("$projectDir/assets/proteinfold_template.html").first() + ) + ch_versions = ch_versions.mix(GENERATE_REPORT.out.versions) } emit: multiqc_report = ch_multiqc // channel: /path/to/multiqc_report.html diff --git a/modules/local/colabfold_batch.nf b/modules/local/colabfold_batch.nf index 5b1c5467..28e69e04 100644 --- a/modules/local/colabfold_batch.nf +++ b/modules/local/colabfold_batch.nf @@ -18,9 +18,10 @@ process COLABFOLD_BATCH { val numRec output: - path ("*") , emit: pdb - path ("*_mqc.png") , emit: multiqc - path "versions.yml", emit: versions + tuple val(meta), path ("*_relaxed_rank_*.pdb"), emit: pdb + tuple val(meta), path ("*_coverage.png") , emit: msa + tuple val(meta), path ("*_mqc.png") , emit: multiqc + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -52,6 +53,11 @@ process COLABFOLD_BATCH { """ touch ./"${fasta.baseName}"_colabfold.pdb touch ./"${fasta.baseName}"_mqc.png + touch ./${fasta.baseName}_relaxed_rank_01.pdb + touch ./${fasta.baseName}_relaxed_rank_02.pdb + touch ./${fasta.baseName}_relaxed_rank_03.pdb + touch ./${fasta.baseName}_coverage.png + touch ./${fasta.baseName}_scores_rank.json cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/generate_report.nf b/modules/local/generate_report.nf new file mode 100644 index 00000000..45380af3 --- /dev/null +++ b/modules/local/generate_report.nf @@ -0,0 +1,52 @@ +process GENERATE_REPORT { + tag "$meta.id-$meta.model" + label 'process_single' + + conda "bioconda::multiqc:1.21" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'oras://community.wave.seqera.io/library/pip_biopython_matplotlib_plotly:e865101a15ad0014' : + 'community.wave.seqera.io/library/pip_biopython_matplotlib_plotly:4d51afeb4bb75495' }" + + input: + tuple val(meta), path(pdb) + tuple val(meta_msa), path(msa) + val(output_type) + path(template) + + output: + tuple val(meta), path ("*report.html"), emit: report + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + + """ + generate_report.py --type ${output_type} \\ + --msa ${msa} \\ + --pdb ${pdb.join(' ')} \\ + --html_template ${template} \\ + --output_dir ./ \\ + --name ${meta.id} \\ + $args \\ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //g') + generate_report.py: \$(python3 --version) + END_VERSIONS + """ + + stub: + """ + touch test_alphafold2_report.html + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //g') + generate_report.py: \$(python3 --version) + END_VERSIONS + """ +} diff --git a/modules/local/run_alphafold2.nf b/modules/local/run_alphafold2.nf index 20cbf9fc..aae13bf7 100644 --- a/modules/local/run_alphafold2.nf +++ b/modules/local/run_alphafold2.nf @@ -29,7 +29,9 @@ process RUN_ALPHAFOLD2 { output: path ("${fasta.baseName}*") - path "*_mqc.tsv", emit: multiqc + tuple val(meta), path ("${fasta.baseName}/ranked*pdb"), emit: pdb + tuple val(meta), path ("${fasta.baseName}/*_msa.tsv") , emit: msa + tuple val(meta), path ("*_mqc.tsv") , emit: multiqc path "versions.yml", emit: versions when: @@ -72,6 +74,9 @@ process RUN_ALPHAFOLD2 { paste ranked_0_plddt.tsv ranked_1_plddt.tsv ranked_2_plddt.tsv ranked_3_plddt.tsv ranked_4_plddt.tsv > plddt.tsv echo -e Positions"\\t"rank_0"\\t"rank_1"\\t"rank_2"\\t"rank_3"\\t"rank_4 > header.tsv cat header.tsv plddt.tsv > ../"${fasta.baseName}"_plddt_mqc.tsv + + extract_output.py --name ${fasta.baseName} \\ + --pkls features.pkl cd .. cat <<-END_VERSIONS > versions.yml @@ -84,10 +89,17 @@ process RUN_ALPHAFOLD2 { """ touch ./"${fasta.baseName}".alphafold.pdb touch ./"${fasta.baseName}"_mqc.tsv + mkdir "${fasta.baseName}" + touch "${fasta.baseName}/ranked_0.pdb" + touch "${fasta.baseName}/ranked_1.pdb" + touch "${fasta.baseName}/ranked_2.pdb" + touch "${fasta.baseName}/ranked_3.pdb" + touch "${fasta.baseName}/ranked_4.pdb" + touch "${fasta.baseName}/${fasta.baseName}_msa.tsv" cat <<-END_VERSIONS > versions.yml "${task.process}": - awk: \$(gawk --version| head -1 | sed 's/GNU Awk //; s/, API:.*//') + python: \$(python3 --version | sed 's/Python //g') END_VERSIONS """ } diff --git a/modules/local/run_alphafold2_msa.nf b/modules/local/run_alphafold2_msa.nf index 85a40676..a4f00676 100644 --- a/modules/local/run_alphafold2_msa.nf +++ b/modules/local/run_alphafold2_msa.nf @@ -29,7 +29,7 @@ process RUN_ALPHAFOLD2_MSA { output: path ("${fasta.baseName}*") - path ("${fasta.baseName}.features.pkl"), emit: features + tuple val(meta), path ("${fasta.baseName}.features.pkl"), emit: features path "versions.yml" , emit: versions when: diff --git a/modules/local/run_alphafold2_pred.nf b/modules/local/run_alphafold2_pred.nf index ee9983c5..247c93e4 100644 --- a/modules/local/run_alphafold2_pred.nf +++ b/modules/local/run_alphafold2_pred.nf @@ -26,11 +26,13 @@ process RUN_ALPHAFOLD2_PRED { path ('uniref90/*') path ('pdb_seqres/*') path ('uniprot/*') - path msa + tuple val(meta), path(msa) output: path ("${fasta.baseName}*") - path "*_mqc.tsv", emit: multiqc + tuple val(meta), path ("${fasta.baseName}/ranked*pdb"), emit: pdb + tuple val(meta), path ("*_msa.tsv"), emit: msa + tuple val(meta), path ("*_mqc.tsv"), emit: multiqc path "versions.yml", emit: versions when: @@ -58,8 +60,10 @@ process RUN_ALPHAFOLD2_PRED { paste ranked_0_plddt.tsv ranked_1_plddt.tsv ranked_2_plddt.tsv ranked_3_plddt.tsv ranked_4_plddt.tsv > plddt.tsv echo -e Positions"\\t"rank_0"\\t"rank_1"\\t"rank_2"\\t"rank_3"\\t"rank_4 > header.tsv cat header.tsv plddt.tsv > ../"${fasta.baseName}"_plddt_mqc.tsv - cd .. + cd .. + extract_output.py --name ${fasta.baseName} \\ + --pkls ${msa} cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python3 --version | sed 's/Python //g') @@ -70,10 +74,17 @@ process RUN_ALPHAFOLD2_PRED { """ touch ./"${fasta.baseName}".alphafold.pdb touch ./"${fasta.baseName}"_mqc.tsv + mkdir "${fasta.baseName}" + touch "${fasta.baseName}/ranked_0.pdb" + touch "${fasta.baseName}/ranked_1.pdb" + touch "${fasta.baseName}/ranked_2.pdb" + touch "${fasta.baseName}/ranked_3.pdb" + touch "${fasta.baseName}/ranked_4.pdb" + touch ${fasta.baseName}_msa.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": - awk: \$(gawk --version| head -1 | sed 's/GNU Awk //; s/, API:.*//') + python: \$(python3 --version | sed 's/Python //g') END_VERSIONS """ } diff --git a/modules/local/run_esmfold.nf b/modules/local/run_esmfold.nf index f37c9eb3..41bcadbf 100644 --- a/modules/local/run_esmfold.nf +++ b/modules/local/run_esmfold.nf @@ -14,8 +14,8 @@ process RUN_ESMFOLD { val numRec output: - path ("${fasta.baseName}*.pdb"), emit: pdb - path ("${fasta.baseName}_plddt_mqc.tsv"), emit: multiqc + tuple val(meta), path ("${fasta.baseName}*.pdb") , emit: pdb + tuple val(meta), path ("${fasta.baseName}_plddt_mqc.tsv"), emit: multiqc path "versions.yml", emit: versions when: @@ -36,6 +36,8 @@ process RUN_ESMFOLD { awk '{print \$2"\\t"\$3"\\t"\$4"\\t"\$6"\\t"\$11}' "${fasta.baseName}"*.pdb | grep -v 'N/A' | uniq > plddt.tsv echo -e Atom_serial_number"\\t"Atom_name"\\t"Residue_name"\\t"Residue_sequence_number"\\t"pLDDT > header.tsv cat header.tsv plddt.tsv > "${fasta.baseName}"_plddt_mqc.tsv + mv "${fasta.baseName}"*.pdb tmp.pdb + mv tmp.pdb ${fasta.baseName}.pdb cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/nextflow.config b/nextflow.config index 2bc5ca0a..0ab0fa8e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -81,6 +81,7 @@ params { // Process skipping options skip_multiqc = false + skip_visualisation = false // MultiQC options multiqc_config = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 2d3ce68e..9baab54b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -203,6 +203,11 @@ "type": "boolean", "description": "Skip MultiQC.", "fa_icon": "fas fa-fast-forward" + }, + "skip_visualisation": { + "type": "boolean", + "description": "Skip visualisation reports.", + "fa_icon": "fas fa-fast-forward" } } }, diff --git a/tower.yml b/tower.yml index 7fc70c10..8ef8077a 100644 --- a/tower.yml +++ b/tower.yml @@ -7,3 +7,9 @@ reports: display: "COLABFOLD MultiQC HTML report" samplesheet.csv: display: "Auto-created samplesheet with collated metadata and FASTQ paths" + "*_alphafold2_report.html": + display: "ALPHAFOLD2 - Predicted structures" + "*_esmfold_report.html": + display: "ESMFOLD - Predicted structures" + "*_colabfold_report.html": + display: "COLABFOLD - Predicted structures" diff --git a/workflows/alphafold2.nf b/workflows/alphafold2.nf index 9a1aebae..9c706114 100644 --- a/workflows/alphafold2.nf +++ b/workflows/alphafold2.nf @@ -57,6 +57,8 @@ workflow ALPHAFOLD2 { main: ch_multiqc_files = Channel.empty() + ch_pdb = Channel.empty() + ch_msa = Channel.empty() // // Create input channel from input file provided through params.input @@ -94,7 +96,9 @@ workflow ALPHAFOLD2 { ch_pdb_seqres, ch_uniprot ) - ch_multiqc_rep = RUN_ALPHAFOLD2.out.multiqc.collect() + ch_pdb = ch_pdb.mix(RUN_ALPHAFOLD2.out.pdb) + ch_msa = ch_msa.mix(RUN_ALPHAFOLD2.out.msa) + ch_multiqc_rep = RUN_ALPHAFOLD2.out.multiqc.map{it[1]}.collect() ch_versions = ch_versions.mix(RUN_ALPHAFOLD2.out.versions) } else if (alphafold2_mode == 'split_msa_prediction') { @@ -134,7 +138,9 @@ workflow ALPHAFOLD2 { ch_uniprot, RUN_ALPHAFOLD2_MSA.out.features ) - ch_multiqc_rep = RUN_ALPHAFOLD2_PRED.out.multiqc.collect() + ch_pdb = ch_pdb.mix(RUN_ALPHAFOLD2_PRED.out.pdb) + ch_msa = ch_msa.mix(RUN_ALPHAFOLD2_PRED.out.msa) + ch_multiqc_rep = RUN_ALPHAFOLD2_PRED.out.multiqc.map{it[1]}.collect() ch_versions = ch_versions.mix(RUN_ALPHAFOLD2_PRED.out.versions) } @@ -142,9 +148,8 @@ workflow ALPHAFOLD2 { // Collate and save software versions // softwareVersionsToYAML(ch_versions) - .collectFile(storeDir: "${params.outdir}/pipeline_info", name: 'nf_core_proteinfold_software_mqc_versions.yml', sort: true, newLine: true) + .collectFile(storeDir: "${params.outdir}/pipeline_info", name: 'nf_core_proteinfold_software_mqc_alphafold2_versions.yml', sort: true, newLine: true) .set { ch_collated_versions } - // // MODULE: MultiQC // @@ -175,6 +180,8 @@ workflow ALPHAFOLD2 { } emit: + pdb = ch_pdb // channel: /path/to/*.pdb + msa = ch_msa // channel: /path/to/*msa.tsv multiqc_report = ch_multiqc_report // channel: /path/to/multiqc_report.html versions = ch_versions // channel: [ path(versions.yml) ] } diff --git a/workflows/colabfold.nf b/workflows/colabfold.nf index eafc222c..1d299cd7 100644 --- a/workflows/colabfold.nf +++ b/workflows/colabfold.nf @@ -131,7 +131,7 @@ workflow COLABFOLD { // Collate and save software versions // softwareVersionsToYAML(ch_versions) - .collectFile(storeDir: "${params.outdir}/pipeline_info", name: 'nf_core_proteinfold_software_mqc_versions.yml', sort: true, newLine: true) + .collectFile(storeDir: "${params.outdir}/pipeline_info", name: 'nf_core_proteinfold_software_mqc_colabfold_versions.yml', sort: true, newLine: true) .set { ch_collated_versions } // @@ -152,7 +152,7 @@ workflow COLABFOLD { ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) - ch_multiqc_files = ch_multiqc_files.mix(COLABFOLD_BATCH.out.multiqc.collect()) + ch_multiqc_files = ch_multiqc_files.mix(COLABFOLD_BATCH.out.multiqc.map{it[1]}.collect()) MULTIQC ( ch_multiqc_files.collect(), @@ -164,6 +164,8 @@ workflow COLABFOLD { } emit: + pdb = COLABFOLD_BATCH.out.pdb // channel: /path/to/*.pdb + msa = COLABFOLD_BATCH.out.msa // channel: /path/to/*_coverage.png multiqc_report = ch_multiqc_report // channel: /path/to/multiqc_report.html versions = ch_versions // channel: [ path(versions.yml) ] } diff --git a/workflows/esmfold.nf b/workflows/esmfold.nf index 962c01a1..48b03108 100644 --- a/workflows/esmfold.nf +++ b/workflows/esmfold.nf @@ -80,7 +80,7 @@ workflow ESMFOLD { // Collate and save software versions // softwareVersionsToYAML(ch_versions) - .collectFile(storeDir: "${params.outdir}/pipeline_info", name: 'nf_core_proteinfold_software_mqc_versions.yml', sort: true, newLine: true) + .collectFile(storeDir: "${params.outdir}/pipeline_info", name: 'nf_core_proteinfold_software_mqc_esmfold_versions.yml', sort: true, newLine: true) .set { ch_collated_versions } // @@ -101,7 +101,7 @@ workflow ESMFOLD { ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) - ch_multiqc_files = ch_multiqc_files.mix(RUN_ESMFOLD.out.multiqc.collect()) + ch_multiqc_files = ch_multiqc_files.mix(RUN_ESMFOLD.out.multiqc.map{it[1]}.collect()) MULTIQC ( ch_multiqc_files.collect(), @@ -113,8 +113,9 @@ workflow ESMFOLD { } emit: - multiqc_report = ch_multiqc_report // channel: /path/to/multiqc_report.html - versions = ch_versions // channel: [ path(versions.yml) ] + pdb = RUN_ESMFOLD.out.pdb // channel: /path/to/*pdb + multiqc_report = ch_multiqc_report // channel: /path/to/multiqc_report.html + versions = ch_versions // channel: [ path(versions.yml) ] } /*