From ca415cbaf3a72005b89b278760e7331bf0a5e214 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Sun, 12 Nov 2023 12:14:54 +0000 Subject: [PATCH 01/15] wip ms2rescore --- bin/ms2rescore_cli.py | 95 +++++++++++++++++++++++++++++++++++++ modules/local/ms2rescore.nf | 58 ++++++++++++++++++++++ nextflow.config | 4 ++ workflows/mhcquant.nf | 10 ++++ 4 files changed, 167 insertions(+) create mode 100644 bin/ms2rescore_cli.py create mode 100644 modules/local/ms2rescore.nf diff --git a/bin/ms2rescore_cli.py b/bin/ms2rescore_cli.py new file mode 100644 index 00000000..a5e1951e --- /dev/null +++ b/bin/ms2rescore_cli.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python +# Written by Jonas Scheid under the MIT license + +import sys +import click +import importlib.resources +import json +import logging + +from ms2rescore import rescore, package_data +from psm_utils.io.idxml import IdXMLReader, IdXMLWriter + +logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s') + +def parse_cli_arguments_to_config(**kwargs): + """Update default MS²Rescore config with CLI arguments""" + config = json.load(importlib.resources.open_text(package_data, "config_default.json")) + + for key, value in kwargs.items(): + + # Skip these arguments since they need to set in a nested dict of feature_generators + if key in ['ms2pip_model', 'ms2_tolerance']: + continue + + elif key == 'feature_generators': + feature_generators = value.split(',') + # Reset feature generator dict since there might be default generators we don't want + config["ms2rescore"]["feature_generators"] = {} + if 'basic' in feature_generators: + config["ms2rescore"]["feature_generators"]["basic"] = {} + if 'ms2pip' in feature_generators: + config["ms2rescore"]["feature_generators"]["ms2pip"] = {'model': kwargs['ms2pip_model'], 'ms2_tolerance': kwargs['ms2_tolerance']} + if 'deeplc' in feature_generators: + config["ms2rescore"]["feature_generators"]["deeplc"] = {'deeplc_retrain': False} + if 'maxquant' in feature_generators: + config["ms2rescore"]["feature_generators"]["maxquant"] = {} + if 'ionmob' in feature_generators: + config["ms2rescore"]["feature_generators"]["ionmob"] = {} + + elif key == 'rescoring_engine': + # Reset rescoring engine dict we want to allow only computing features + config["ms2rescore"]["rescoring_engine"] = {} + if value == 'mokapot': + config["ms2rescore"]["rescoring_engine"]["mokapot"] = {"write_weights": True, + "write_txt": False, + "write_flashlfq": False, + "rng": kwargs['rng'], + "max_workers": kwargs['processes']} + if value == 'percolator': + config["ms2rescore"]["rescoring_engine"]["percolator"] = {} + + else: + config["ms2rescore"][key] = value + + return config + + +def rescore_idxml(input_file, output_file, config): + """Rescore PSMs in an idXML file and keep other information unchanged.""" + # Read PSMs + reader = IdXMLReader(input_file) + psm_list = reader.read_file() + + # Rescore + rescore(config, psm_list) + + # Write + writer = IdXMLWriter(output_file, reader.protein_ids, reader.peptide_ids) + writer.write_file(psm_list) + + +@click.command() +@click.option("-p", "--psm_file", help="Path to PSM file (PIN, mzIdentML, MaxQuant msms, X!Tandem XML, idXML)", required=True) +@click.option("-s", "--spectrum_path", help="Path to MGF/mzML spectrum file or directory with spectrum files (default: derived from identification file)", required=True) +@click.option("-o", "--output_path", help="Path and stem for output file names (default: derive from identification file)") +@click.option("-l", "--log_level", help="Logging level (default: `info`)", default="info") +@click.option("-n", "--processes", help="Number of parallel processes available to MS²Rescore", type=int, default=1) +@click.option("-f", "--fasta_file", help="Path to FASTA file") +@click.option("-fg", "--feature_generators", help="Comma-separated list of feature generators to use (default: `basic,ms2pip,deeplc`). See ms2rescore doc for further information", default="basic,ms2pip,deeplc") +@click.option("-am", "--ms2pip_model", help="MS²PIP model (default: `Immuno-HCD`)", default="Immuno-HCD") +@click.option("-ms2tol", "--ms2_tolerance", help="Fragment mass tolerance [Da](default: `0.02`)", type=int, default=0.02) +@click.option("-re", "--rescoring_engine", help="Either mokapot or percolator (default: `mokapot`)", default="mokapot") +@click.option("-rng", "--rng", help="Seed for mokapot's random number generator (default: `4711`)", type=int, default=4711) +@click.option("-d", "--id_decoy_pattern", help="Regex decoy pattern (default: `DECOY_`)", default='^DECOY_') +@click.option("-lsb", "--lower_score_is_better", help="Interpretation of primary search engine score (default: True)", default=True) + + +def main(**kwargs): + config = parse_cli_arguments_to_config(**kwargs) + logging.info("MS²Rescore config:") + logging.info(config) + rescore_idxml(kwargs['psm_file'], kwargs['output_path'], config) + +if __name__ == "__main__": + sys.exit(main()) diff --git a/modules/local/ms2rescore.nf b/modules/local/ms2rescore.nf new file mode 100644 index 00000000..e1373621 --- /dev/null +++ b/modules/local/ms2rescore.nf @@ -0,0 +1,58 @@ +process MS2RESCORE { + tag "$meta.id" + label 'process_high' + + conda "bioconda::ms2rescore=3.0.0b1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ms2rescore:3.0.0b1--pyhdfd78af_1': + 'biocontainers/ms2rescore:3.0.0b1--pyhdfd78af_1' }" + + input: + tuple val(meta), path(idxml), path(mzml), path(fasta) + + output: + tuple val(meta), path("*ms2rescore.idXML"), emit: rescored_idxml + path "versions.yml" , emit: versions + // TODO add parsing of the html report + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}_ms2rescore" + def ms2pip_model = params.ms2pip_model_name ? "--ms2pip_model ${params.ms2pip_model_name}" : '' + def ms2_tolerance = 2 * float(params.fragment_mass_tolerance) + def rescoring_engine = params.rescoring_engine ? "--rescoring_engine ${params.rescoring_engine}" + + """ + ms2rescore.py \\ + --psm_file $idxml \\ + --spectrum_path $mzml \\ + --output_path ${prefix}.idXML \\ + --processes $task.cpus \\ + --feature_generators basic,ms2pip,deeplc \\ + $ms2pip_model \\ + $ms2_tolerance \\ + $rescoring_engine \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + MS²Rescore: \$(echo \$(ms2rescore --version 2>&1) | grep -oP 'MS²Rescore \(v\K[^\)]+' )) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}_ms2rescore" + + """ + touch ${prefix}.idXML + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + MS²Rescore: \$(echo \$(ms2rescore --version 2>&1) | grep -oP 'MS²Rescore \(v\K[^\)]+' )) + END_VERSIONS + """ +} diff --git a/nextflow.config b/nextflow.config index 7bcca384..332f170b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -81,6 +81,10 @@ params { use_ms2pip = false ms2pip_model_name = 'Immuno-HCD' + // MS2Rescore settings + use_ms2rescore = false + rescoring_engine = 'mokapot' + // MultiQC options skip_multiqc = false multiqc_config = null diff --git a/workflows/mhcquant.nf b/workflows/mhcquant.nf index 787caa60..ad2d199d 100644 --- a/workflows/mhcquant.nf +++ b/workflows/mhcquant.nf @@ -74,6 +74,7 @@ include { OPENMS_COMETADAPTER } from include { OPENMS_PEPTIDEINDEXER } from '../modules/local/openms_peptideindexer' include { DEEPLC } from '../modules/local/deeplc' include { MS2PIP } from '../modules/local/ms2pip' +include { MS2RESCORE } from '../modules/local/ms2rescore' include { OPENMS_IDFILTER as OPENMS_IDFILTER_Q_VALUE } from '../modules/local/openms_idfilter' include { OPENMS_IDMERGER } from '../modules/local/openms_idmerger' @@ -250,6 +251,15 @@ workflow MHCQUANT { OPENMS_IDMERGER(ch_runs_to_merge) ch_versions = ch_versions.mix(OPENMS_IDMERGER.out.versions.ifEmpty(null)) + // Run MS2Rescore + if params.use_ms2rescore { + MS2RESCORE(OPENMS_IDMERGER.out.idxml) + ch_versions = ch_versions.mix(MS2RESCORE.out.versions.ifEmpty(null)) + ch_rescored_runs = MS2RESCORE.out.rescored_idxml + } else { + ch_rescored_runs = OPENMS_IDMERGER.out.rescored_idxml + } + // Extract PSM features for Percolator OPENMS_PSMFEATUREEXTRACTOR(OPENMS_IDMERGER.out.idxml) ch_versions = ch_versions.mix(OPENMS_PSMFEATUREEXTRACTOR.out.versions.ifEmpty(null)) From 2420fa6b2dfe010b0e52dc27ae643a5ed3a9e239 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Wed, 29 Nov 2023 11:10:21 +0000 Subject: [PATCH 02/15] Add ms2rescore module, remove ms2pip and deeplc modules --- .github/workflows/ci.yml | 2 +- bin/ms2rescore_cli.py | 7 +- conf/modules.config | 78 +++++++++++-------- ...test_deeplc.config => test_mokapot.config} | 15 ++-- ...t_ms2pip.config => test_percolator.config} | 9 +-- modules/local/ms2rescore.nf | 22 ++---- modules/local/openms_idfilter.nf | 1 + modules/local/openms_idscoreswitcher.nf | 4 +- modules/local/openms_psmfeatureextractor.nf | 23 +----- nextflow.config | 17 +--- nextflow_schema.json | 62 +++++---------- subworkflows/local/quant.nf | 3 +- workflows/mhcquant.nf | 67 ++++++++-------- 13 files changed, 132 insertions(+), 178 deletions(-) mode change 100644 => 100755 bin/ms2rescore_cli.py rename conf/{test_deeplc.config => test_mokapot.config} (69%) rename conf/{test_ms2pip.config => test_percolator.config} (77%) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f8ad5eb4..5e15820b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -55,7 +55,7 @@ jobs: # Test latest edge release of Nextflow - NXF_VER: "" NXF_EDGE: "1" - tests: ["test_deeplc", "test_ms2pip", "test_ionannotator", "test_full"] + tests: ["test_mokapot", "test_percolator", "test_ionannotator", "test_full"] steps: - name: Check out pipeline code uses: actions/checkout@v2 diff --git a/bin/ms2rescore_cli.py b/bin/ms2rescore_cli.py old mode 100644 new mode 100755 index a5e1951e..12ce6b8c --- a/bin/ms2rescore_cli.py +++ b/bin/ms2rescore_cli.py @@ -47,7 +47,8 @@ def parse_cli_arguments_to_config(**kwargs): "rng": kwargs['rng'], "max_workers": kwargs['processes']} if value == 'percolator': - config["ms2rescore"]["rescoring_engine"]["percolator"] = {} + logging.info("Percolator rescoring engine has been specified. Use the idXML containing rescoring features and run Percolator in a separate step.") + continue else: config["ms2rescore"][key] = value @@ -76,9 +77,9 @@ def rescore_idxml(input_file, output_file, config): @click.option("-l", "--log_level", help="Logging level (default: `info`)", default="info") @click.option("-n", "--processes", help="Number of parallel processes available to MS²Rescore", type=int, default=1) @click.option("-f", "--fasta_file", help="Path to FASTA file") -@click.option("-fg", "--feature_generators", help="Comma-separated list of feature generators to use (default: `basic,ms2pip,deeplc`). See ms2rescore doc for further information", default="basic,ms2pip,deeplc") +@click.option("-fg", "--feature_generators", help="Comma-separated list of feature generators to use (default: `ms2pip,deeplc`). See ms2rescore doc for further information", default="") @click.option("-am", "--ms2pip_model", help="MS²PIP model (default: `Immuno-HCD`)", default="Immuno-HCD") -@click.option("-ms2tol", "--ms2_tolerance", help="Fragment mass tolerance [Da](default: `0.02`)", type=int, default=0.02) +@click.option("-ms2tol", "--ms2_tolerance", help="Fragment mass tolerance [Da](default: `0.02`)", type=float, default=0.02) @click.option("-re", "--rescoring_engine", help="Either mokapot or percolator (default: `mokapot`)", default="mokapot") @click.option("-rng", "--rng", help="Seed for mokapot's random number generator (default: `4711`)", type=int, default=4711) @click.option("-d", "--id_decoy_pattern", help="Regex decoy pattern (default: `DECOY_`)", default='^DECOY_') diff --git a/conf/modules.config b/conf/modules.config index 62f0f4e1..7a33093b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -156,6 +156,23 @@ process { ] } + withName: 'MS2RESCORE' { + ext.args = [ + "--ms2_tolerance ${2 * params.fragment_mass_tolerance}", + "--ms2pip_model ${params.ms2pip_model}", + "--rescoring_engine ${params.rescoring_engine}", + params.feature_generators.trim() ? "--feature_generators ${params.feature_generators}" : '' + ].join(' ').trim() + publishDir = [ + [path: "${params.outdir}/intermediate_results/ms2rescore", + mode: params.publish_dir_mode, + pattern: '*.idXML'], + [path: "${params.outdir}/multiqc/ms2rescore", + mode: params.publish_dir_mode, + pattern: '*.html'] + ] + } + withName: 'OPENMS_PERCOLATORADAPTER' { ext.args = [ "-seed 4711", @@ -199,14 +216,6 @@ process { ] } - withName: 'OPENMS_IDSCORESWITCHER' { - publishDir = [ - mode: params.publish_dir_mode, - pattern: '*.idXML', - enabled: false - ] - } - withName: 'PYOPENMS_IDFILTER' { publishDir = [ mode: params.publish_dir_mode, @@ -255,6 +264,23 @@ process { } +process { + if (!params.skip_quantification) { + withName: 'NFCORE_MHCQUANT:MHCQUANT:QUANT:OPENMS_IDSCORESWITCHER' { + ext.args = [ + "-new_score COMET:xcorr", + "-new_score_orientation higher_better", + "-old_score q-value" + ].join(' ').trim() + publishDir = [ + mode: params.publish_dir_mode, + pattern: '*.idXML', + enabled: false + ] + } + } +} + // Refine on predicted subset process { @@ -469,30 +495,20 @@ process { } } - process { - if (params.use_deeplc) { - withName: 'DEEPLC' { - publishDir = [ - path: {"${params.outdir}/DeepLC"}, - mode: params.publish_dir_mode, - pattern: '*.idXML', - enabled: false - ] + if (params.rescoring_engine == 'mokapot') { + withName: 'NFCORE_MHCQUANT:MHCQUANT:OPENMS_IDSCORESWITCHER' { + ext.prefix = {"${meta.id}"} + ext.args = [ + "-new_score q-value", + "-new_score_orientation lower_better", + "-old_score expect" + ].join(' ').trim() + publishDir = [ + mode: params.publish_dir_mode, + pattern: '*.idXML', + enabled: false + ] } } } - - -process { - if (params.use_ms2pip) { - withName: 'MS2PIP' { - publishDir = [ - path: {"${params.outdir}/MS2PIP"}, - mode: params.publish_dir_mode, - pattern: '*.idXML', - enabled: false - ] - } - } -} diff --git a/conf/test_deeplc.config b/conf/test_mokapot.config similarity index 69% rename from conf/test_deeplc.config rename to conf/test_mokapot.config index dff885b5..3dd49220 100644 --- a/conf/test_deeplc.config +++ b/conf/test_mokapot.config @@ -1,18 +1,18 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Nextflow config file for running minimal tests with DeepLC + Nextflow config file for running minimal tests with MS²Rescore and Mokapot ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Defines input files and everything required to run a fast and simple pipeline test. Use as follows: - nextflow run nf-core/mhcquant -profile test_deeplc, --outdir + nextflow run nf-core/mhcquant -profile test_mokapot, --outdir ---------------------------------------------------------------------------------------- */ params { - config_profile_name = 'Test DeepLC profile' - config_profile_description = 'Minimal test dataset to check pipeline function with DeepLC' + config_profile_name = 'Test Mokapot profile' + config_profile_description = 'Minimal test dataset to check pipeline function with Mokapot' // Limit resources so that this can run on GitHub Actions max_cpus = 2 @@ -24,9 +24,6 @@ params { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mhcquant/testdata/HepG2_sample_sheet.tsv' // Don't do quantification since this step needs a larger test dataset (-> test quantification using test_full) - skip_quantification = true - use_deeplc = true - deeplc_add_abs_rt_error = true - deeplc_add_sqr_rt_error = true - deeplc_add_log_rt_error = true + skip_quantification = true + rescoring_engine = 'mokapot' } diff --git a/conf/test_ms2pip.config b/conf/test_percolator.config similarity index 77% rename from conf/test_ms2pip.config rename to conf/test_percolator.config index 6343b683..bb8a95e3 100644 --- a/conf/test_ms2pip.config +++ b/conf/test_percolator.config @@ -1,17 +1,17 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Nextflow config file for running minimal tests with MS2PIP + Nextflow config file for running minimal tests with MS²Rescore and Percolator ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Defines input files and everything required to run a fast and simple pipeline test. Use as follows: - nextflow run nf-core/mhcquant -profile test_ms2pip, --outdir + nextflow run nf-core/mhcquant -profile test_percolator, --outdir ---------------------------------------------------------------------------------------- */ params { - config_profile_name = 'Test MS2PIP profile' + config_profile_name = 'Test Percolator profile' config_profile_description = 'Minimal test dataset to check pipeline function with MS2PIP' // Limit resources so that this can run on GitHub Actions @@ -25,6 +25,5 @@ params { // Don't do quantification since this step needs a larger test dataset (-> test quantification using test_full) skip_quantification = true - use_ms2pip = true - ms2pip_model_name = 'Immuno-HCD' + rescoring_engine = 'mokapot' } diff --git a/modules/local/ms2rescore.nf b/modules/local/ms2rescore.nf index e1373621..9ade4bf8 100644 --- a/modules/local/ms2rescore.nf +++ b/modules/local/ms2rescore.nf @@ -11,9 +11,10 @@ process MS2RESCORE { tuple val(meta), path(idxml), path(mzml), path(fasta) output: - tuple val(meta), path("*ms2rescore.idXML"), emit: rescored_idxml - path "versions.yml" , emit: versions - // TODO add parsing of the html report + tuple val(meta), path("*ms2rescore.idXML") , emit: idxml + tuple val(meta), path("*feature_names.tsv"), emit: feature_names + tuple val(meta), path("*.html" ) , optional:true, emit: html + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -21,25 +22,18 @@ process MS2RESCORE { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}_ms2rescore" - def ms2pip_model = params.ms2pip_model_name ? "--ms2pip_model ${params.ms2pip_model_name}" : '' - def ms2_tolerance = 2 * float(params.fragment_mass_tolerance) - def rescoring_engine = params.rescoring_engine ? "--rescoring_engine ${params.rescoring_engine}" """ - ms2rescore.py \\ + ms2rescore_cli.py \\ --psm_file $idxml \\ - --spectrum_path $mzml \\ + --spectrum_path . \\ --output_path ${prefix}.idXML \\ --processes $task.cpus \\ - --feature_generators basic,ms2pip,deeplc \\ - $ms2pip_model \\ - $ms2_tolerance \\ - $rescoring_engine \\ $args cat <<-END_VERSIONS > versions.yml "${task.process}": - MS²Rescore: \$(echo \$(ms2rescore --version 2>&1) | grep -oP 'MS²Rescore \(v\K[^\)]+' )) + MS²Rescore: \$(echo \$(ms2rescore --version 2>&1) | grep -oP 'MS²Rescore \\(v\\K[^\\)]+' )) END_VERSIONS """ @@ -52,7 +46,7 @@ process MS2RESCORE { cat <<-END_VERSIONS > versions.yml "${task.process}": - MS²Rescore: \$(echo \$(ms2rescore --version 2>&1) | grep -oP 'MS²Rescore \(v\K[^\)]+' )) + MS²Rescore: \$(echo \$(ms2rescore --version 2>&1) | grep -oP 'MS²Rescore \\(v\\K[^\\)]+' )) END_VERSIONS """ } diff --git a/modules/local/openms_idfilter.nf b/modules/local/openms_idfilter.nf index fb946789..ed8dcf72 100644 --- a/modules/local/openms_idfilter.nf +++ b/modules/local/openms_idfilter.nf @@ -21,6 +21,7 @@ process OPENMS_IDFILTER { def prefix = task.ext.prefix ?: "${meta.id}_filtered" def args = task.ext.args ?: '' + // TODO: Fix such that [] emtpy list is provided as peptide filter, not null if (peptide_filter != null) { args += "-whitelist:peptides $peptide_filter" } diff --git a/modules/local/openms_idscoreswitcher.nf b/modules/local/openms_idscoreswitcher.nf index 1df324f0..9d0f1b03 100644 --- a/modules/local/openms_idscoreswitcher.nf +++ b/modules/local/openms_idscoreswitcher.nf @@ -18,6 +18,7 @@ process OPENMS_IDSCORESWITCHER { task.ext.when == null || task.ext.when script: + // TODO: fix naming to be more generic def prefix = task.ext.prefix ?: "${meta.id}_${meta.sample}_${meta.condition}_switched" def args = task.ext.args ?: '' @@ -25,9 +26,6 @@ process OPENMS_IDSCORESWITCHER { IDScoreSwitcher -in $idxml \\ -out ${prefix}.idXML \\ -threads $task.cpus \\ - -new_score 'COMET:xcorr' \\ - -new_score_orientation 'higher_better' \\ - -old_score 'q-value' \\ $args cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/openms_psmfeatureextractor.nf b/modules/local/openms_psmfeatureextractor.nf index 7ec54ce7..d71fbac2 100644 --- a/modules/local/openms_psmfeatureextractor.nf +++ b/modules/local/openms_psmfeatureextractor.nf @@ -8,7 +8,7 @@ process OPENMS_PSMFEATUREEXTRACTOR { 'biocontainers/openms:3.0.0--h8964181_1' }" input: - tuple val(meta), path(idxml) + tuple val(meta), path(idxml), path(feature_file) output: tuple val(meta), path("*.idXML"), emit: idxml @@ -21,29 +21,14 @@ process OPENMS_PSMFEATUREEXTRACTOR { def prefix = task.ext.prefix ?: "${meta.id}_psm" def args = task.ext.args ?: '' def extra_features = "" - if(params.use_deeplc || params.use_ms2pip){ - extra_features = "-extra" - } - if(params.use_deeplc){ - if(params.deeplc_add_abs_rt_error){ - extra_features = "${extra_features} deeplc_abs_error" - } - if(params.deeplc_add_log_rt_error){ - extra_features = "${extra_features} deeplc_log_error" - } - if(params.deeplc_add_sqr_rt_error || (!params.deeplc_add_sqr_rt_error && !params.deeplc_add_abs_rt_error && !params.deeplc_add_log_rt_error)){ - extra_features = "${extra_features} deeplc_sqr_error" - } - } - if(params.use_ms2pip){ - extra_features = "${extra_features} spectrum_correlation" - } """ + extra_features=\$(awk 'NR > 1 && \$1 !~ /psm_file/ {printf \"%s \", \$2}' ${feature_file}) + PSMFeatureExtractor -in $idxml \\ -out ${prefix}.idXML \\ -threads $task.cpus \\ - $extra_features \\ + -extra \$extra_features \\ $args cat <<-END_VERSIONS > versions.yml diff --git a/nextflow.config b/nextflow.config index 332f170b..cf8a0bbb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -69,21 +69,10 @@ params { annotate_ions = false filter_mzml = false - // DeepLC settings - use_deeplc = false - deeplc_calibration_mode = 'idx_bin' - deeplc_calibration_bins = 20 - deeplc_add_abs_rt_error = false - deeplc_add_sqr_rt_error = false - deeplc_add_log_rt_error = false - - // MS2PIP settings - use_ms2pip = false - ms2pip_model_name = 'Immuno-HCD' - // MS2Rescore settings - use_ms2rescore = false - rescoring_engine = 'mokapot' + rescoring_engine = 'percolator' + feature_generators = 'ms2pip,deeplc' + ms2pip_model = 'Immuno-HCD' // MultiQC options skip_multiqc = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 31135be5..be8f0606 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -277,6 +277,25 @@ "default": 0.01, "description": "Specify the false discovery rate threshold at which peptide hits should be selected." }, + "rescoring_engine": { + "type": "string", + "fa_icon": "fas fa-file-code", + "default": "percolator", + "description": "Specify the rescoring engine that should be used for rescoring. Either percolator or mokapot", + "enum": ["percolator", "mokapot"] + }, + "feature_generators": { + "type": "string", + "fa_icon": "fas fa-file-code", + "default": "ms2pip,deeplc", + "description": "Specify the feature generator that should be used for rescoring. One or multiple of basic,ms2pip,deeplc,ionmob" + }, + "ms2pip_model":{ + "type": "string", + "fa_icon": "fas fa-file-code", + "default": "Immuno-HCD", + "description": "Specify the ms2pip model that should be used for rescoring. Checkout the ms2pip documentation for available models." + }, "refine_fdr_on_predicted_subset": { "type": "boolean", "fa_icon": "fas fa-arrows-repeat", @@ -305,49 +324,6 @@ "type": "integer", "fa_icon": "fas fa-train-track", "description": "Maximum subset for percolator training iterations" - }, - "use_deeplc": { - "type": "boolean", - "fa_icon": "fas fa-microchip", - "description": "Use DeepLC retention time features for Percolator rescoring", - "help_text": "https://www.nature.com/articles/s41592-021-01301-5" - }, - "deeplc_calibration_bins": { - "type": "integer", - "fa_icon": "fas fa-train-track", - "description": "Number of bins (peptides) used for DeepLC calibration. For each bin the best hit is used." - }, - "deeplc_calibration_mode": { - "type": "string", - "fa_icon": "fas fa-train-track", - "description": "Specify the DeepLC calibration mode. rt_bin: bin peptides by RT, idx_bin: bin peptides by index, min_max: scale uncalibrated predictions to experimental RT range", - "enum": ["rt_bin", "idx_bin", "min_max"] - }, - "deeplc_add_abs_rt_error": { - "type": "boolean", - "fa_icon": "fas fa-train-track", - "description": "Add absolute RT error to of experimental and predicted RT to the feature set" - }, - "deeplc_add_sqr_rt_error": { - "type": "boolean", - "fa_icon": "fas fa-train-track", - "description": "Add squared RT error to of experimental and predicted RT to the feature set" - }, - "deeplc_add_log_rt_error": { - "type": "boolean", - "fa_icon": "fas fa-train-track", - "description": "Add log RT error to of experimental and predicted RT to the feature set" - }, - "use_ms2pip": { - "type": "boolean", - "fa_icon": "fas fa-microchip", - "description": "Use MS2pip peak intensity prediction for Percolator rescoring", - "help_text": "https://github.com/compomics/ms2pip" - }, - "ms2pip_model_name": { - "type": "string", - "fa_icon": "fas fa-train-track", - "description": "MS2pip model name defined (https://github.com/compomics/ms2pip#specialized-prediction-models)" } } }, diff --git a/subworkflows/local/quant.nf b/subworkflows/local/quant.nf index 64c3d41d..0318bd34 100644 --- a/subworkflows/local/quant.nf +++ b/subworkflows/local/quant.nf @@ -28,6 +28,7 @@ workflow QUANT { OPENMS_IDRIPPER( merged_pout ).ripped .join( merge_meta_map ) .join( filter_q_value ) + // TODO: fdrfiltered is not needed for idscore switching, but for idfilter. This will be adressed in the next refacoring of the workflow .map { group_meta, ripped, meta, fdrfiltered -> [meta, ripped, fdrfiltered] } .transpose() .set { ch_ripped_pout } @@ -42,7 +43,7 @@ workflow QUANT { } // Filter runs based on fdr filtered coprocessed percolator output. - // NOTE: This is an alternative filtering method that will be replaced by IDFilter with new release of OpenMS + // TODO: This is an alternative filtering method that will be replaced by IDFilter with new release of OpenMS PYOPENMS_IDFILTER( ch_runs_to_be_filtered ).filtered .map { meta, idxml -> [[id:meta.sample + '_' + meta.condition], [id:meta.id, file:idxml]] } .groupTuple( sort: sortById ) diff --git a/workflows/mhcquant.nf b/workflows/mhcquant.nf index ad2d199d..ff7751ea 100644 --- a/workflows/mhcquant.nf +++ b/workflows/mhcquant.nf @@ -74,7 +74,8 @@ include { OPENMS_COMETADAPTER } from include { OPENMS_PEPTIDEINDEXER } from '../modules/local/openms_peptideindexer' include { DEEPLC } from '../modules/local/deeplc' include { MS2PIP } from '../modules/local/ms2pip' -include { MS2RESCORE } from '../modules/local/ms2rescore' +include { MS2RESCORE } from '../modules/local/ms2rescore' +include { OPENMS_IDSCORESWITCHER } from '../modules/local/openms_idscoreswitcher' include { OPENMS_IDFILTER as OPENMS_IDFILTER_Q_VALUE } from '../modules/local/openms_idfilter' include { OPENMS_IDMERGER } from '../modules/local/openms_idmerger' @@ -214,26 +215,8 @@ workflow MHCQUANT { // Run comet database search OPENMS_COMETADAPTER(ch_clean_mzml_file.join(ch_decoy_db, remainder:true)) - // Run DeepLC if specified - if (params.use_deeplc){ - DEEPLC(OPENMS_COMETADAPTER.out.idxml) - ch_versions = ch_versions.mix(DEEPLC.out.versions.ifEmpty(null)) - ch_comet_out_idxml = DEEPLC.out.idxml - } else { - ch_comet_out_idxml = OPENMS_COMETADAPTER.out.idxml - } - - // Run MS2PIP if specified - if (params.use_ms2pip){ - MS2PIP(ch_comet_out_idxml.join(ch_clean_mzml_file)) - ch_versions = ch_versions.mix(MS2PIP.out.versions.ifEmpty(null)) - ch_comet_out_idxml_proceeding = MS2PIP.out.idxml - } else { - ch_comet_out_idxml_proceeding = ch_comet_out_idxml - } - // Index decoy and target hits - OPENMS_PEPTIDEINDEXER(ch_comet_out_idxml_proceeding.join(ch_decoy_db)) + OPENMS_PEPTIDEINDEXER(OPENMS_COMETADAPTER.out.idxml.join(ch_decoy_db)) ch_versions = ch_versions.mix(OPENMS_PEPTIDEINDEXER.out.versions.ifEmpty(null)) // Save indexed runs for later use to keep meta-run information. Sort based on file id @@ -252,24 +235,38 @@ workflow MHCQUANT { ch_versions = ch_versions.mix(OPENMS_IDMERGER.out.versions.ifEmpty(null)) // Run MS2Rescore - if params.use_ms2rescore { - MS2RESCORE(OPENMS_IDMERGER.out.idxml) - ch_versions = ch_versions.mix(MS2RESCORE.out.versions.ifEmpty(null)) - ch_rescored_runs = MS2RESCORE.out.rescored_idxml + ch_clean_mzml_file + .map { meta, mzml -> [[id: meta.sample + '_' + meta.condition], mzml] } + .groupTuple() + .join(OPENMS_IDMERGER.out.idxml) + .map { meta, mzml, idxml -> [meta, idxml, mzml, []] } + .set { ch_ms2rescore_in } + + MS2RESCORE(ch_ms2rescore_in) + ch_versions = ch_versions.mix(MS2RESCORE.out.versions) + + if (params.rescoring_engine == 'percolator') { + // TODO: Find a way to parse the feature names of ms2rescore and plug them into the feature extractor + // Extract PSM features for Percolator + OPENMS_PSMFEATUREEXTRACTOR(MS2RESCORE.out.idxml + .join(MS2RESCORE.out.feature_names)) + ch_versions = ch_versions.mix(OPENMS_PSMFEATUREEXTRACTOR.out.versions.ifEmpty(null)) + + // Run Percolator + OPENMS_PERCOLATORADAPTER(OPENMS_PSMFEATUREEXTRACTOR.out.idxml) + ch_versions = ch_versions.mix(OPENMS_PERCOLATORADAPTER.out.versions.ifEmpty(null)) + ch_rescored_runs = OPENMS_PERCOLATORADAPTER.out.idxml } else { - ch_rescored_runs = OPENMS_IDMERGER.out.rescored_idxml + log.warn "The rescoring engine is set to mokapot. This rescoring engine currently only supports psm-level-fdr via ms2rescore." + // TODO: remove whitelist argument from idscoreswitcher + OPENMS_IDSCORESWITCHER(MS2RESCORE.out.idxml + .map { meta, idxml -> [meta, idxml, []] }) + ch_rescored_runs = OPENMS_IDSCORESWITCHER.out.switched_idxml.map { tuple -> tuple.findAll { it != [] }} } - // Extract PSM features for Percolator - OPENMS_PSMFEATUREEXTRACTOR(OPENMS_IDMERGER.out.idxml) - ch_versions = ch_versions.mix(OPENMS_PSMFEATUREEXTRACTOR.out.versions.ifEmpty(null)) - - // Run Percolator - OPENMS_PERCOLATORADAPTER(OPENMS_PSMFEATUREEXTRACTOR.out.idxml) - ch_versions = ch_versions.mix(OPENMS_PERCOLATORADAPTER.out.versions.ifEmpty(null)) - // Filter by percolator q-value - OPENMS_IDFILTER_Q_VALUE(OPENMS_PERCOLATORADAPTER.out.idxml.flatMap { it -> [tuple(it[0], it[1], null)] }) + // TODO: Use empty list instead of null + OPENMS_IDFILTER_Q_VALUE(ch_rescored_runs.flatMap { it -> [tuple(it[0], it[1], null)] }) ch_versions = ch_versions.mix(OPENMS_IDFILTER_Q_VALUE.out.versions.ifEmpty(null)) // @@ -294,7 +291,7 @@ workflow MHCQUANT { // SUBWORKFLOW: QUANT // if (!params.skip_quantification) { - QUANT(merge_meta_map, OPENMS_PERCOLATORADAPTER.out.idxml, filter_q_value, ch_clean_mzml_file) + QUANT(merge_meta_map, ch_rescored_runs, filter_q_value, ch_clean_mzml_file) ch_versions = ch_versions.mix(QUANT.out.versions.ifEmpty(null)) ch_output = QUANT.out.consensusxml } else { From 1cd26e042850676f6a72bfce7abfbea6fd632370 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Wed, 29 Nov 2023 12:24:19 +0000 Subject: [PATCH 03/15] update docs --- CHANGELOG.md | 10 ++++++++++ conf/modules.config | 8 ++++---- docs/output.md | 5 ++--- docs/usage.md | 9 ++++++++- nextflow.config | 6 +++--- 5 files changed, 27 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3232c035..ed41acab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## v2.6.0dev - nfcore/mhcquant + +### `Added` + +- Adding MS²Rescore module with the underlying python CLI [#288](https://github.com/nf-core/mhcquant/issues/288) + +### `Deprecated` + +- Removed MS²PIP and DeepLC modules. These feature generators are now called via the MS²Rescore framework + ## v2.5.0 - nfcore/mhcquant "Angry Bird" - 2023/10/09 ### `Added` diff --git a/conf/modules.config b/conf/modules.config index 7a33093b..dc59df58 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -150,7 +150,7 @@ process { (params.fdr_threshold == '0.01') ? "-score:pep 0.05" : "-score:pep " + params.fdr_threshold ].join(' ').trim() publishDir = [ - path: {"${params.outdir}/intermediate_results/percolator"}, + path: {"${params.outdir}/intermediate_results/rescoring"}, mode: params.publish_dir_mode, pattern: '*.idXML' ] @@ -164,7 +164,7 @@ process { params.feature_generators.trim() ? "--feature_generators ${params.feature_generators}" : '' ].join(' ').trim() publishDir = [ - [path: "${params.outdir}/intermediate_results/ms2rescore", + [path: "${params.outdir}/intermediate_results/rescoring", mode: params.publish_dir_mode, pattern: '*.idXML'], [path: "${params.outdir}/multiqc/ms2rescore", @@ -185,7 +185,7 @@ process { (params.fdr_level != 'psm_level_fdrs') ? "-" + params.fdr_level : "" ].join(' ').trim() publishDir = [ - path: {"${params.outdir}/intermediate_results/percolator"}, + path: {"${params.outdir}/intermediate_results/rescoring"}, mode: params.publish_dir_mode, pattern: '*.idXML' ] @@ -193,7 +193,7 @@ process { withName: 'OPENMS_PSMFEATUREEXTRACTOR' { publishDir = [ - path: {"${params.outdir}/intermediate_results/percolator"}, + path: {"${params.outdir}/intermediate_results/rescoring"}, mode: params.publish_dir_mode, pattern: '*.idXML' ] diff --git a/docs/output.md b/docs/output.md index 112aa819..e8e0e09a 100644 --- a/docs/output.md +++ b/docs/output.md @@ -74,9 +74,8 @@ This folder contains the intermediate results from various steps of the MHCquant - `alignment`: Contains the `trafoXML` files of each run that document the retention time shift after alignment in quantification mode. - `comet`: Contains pin files generated by comet after database search - - `percolator` - - - `{Sample}_{Condition}_psm.idXML`: File holding extra features that will be used by percolator. Created by [PSMFeatureExtractor](https://openms.de/doxygen/release/3.0.0/html/UTILS_PSMFeatureExtractor.html). + - `rescoring` + - `{Sample}_{Condition}_(psm|ms2rescore).idXML`: File holding extra features generated by MS²Rescore that will be used by percolator or mokapot. - `{Sample}_{Condition}_pout.idXML`: Unfiltered percolator output. - `{Sample}_{Condition}_pout_filtered.idXML`: FDR-filtered percolator output. diff --git a/docs/usage.md b/docs/usage.md index e454a066..65869336 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -57,12 +57,19 @@ ID Sample Condition ReplicateFileName An [example samplesheet](../assets/samplesheet.tsv) has been provided with the pipeline. +## Rescoring using MS²Rescore + +By default the pipline generates additional features using MS²PIP and DeepLC via the MS²Rescore framework (`--feature_generators deeplc,ms2pip`). Additional feature generators can be added (`basic,deeplc,ionmob,maxquant,ms2pip`) to boost identification rates and quality. Please make sure you provide the correct `--ms2pip_model` (default: `Immuno-HCD`). All available MS²PIP models can be found on [GitHub](https://github.com/compomics/ms2pip). + +MS²Rescore creates a comprehensive QC report of the added features used for rescoring. This report is only available if `--rescoring_engine mokapot` is specified (default: `percolator`). The report can be found in `/multiqc/ms2rescore`. Further information on the tool itself can be read up in the published paper [Declerq et al. 2022](https://www.mcponline.org/article/S1535-9476(22)00074-3/fulltext) + + ## Running the pipeline The typical command for running the pipeline is as follows: ```console -nextflow run nf-core/mhcquant --input 'samples.tsv' --outdir --fasta 'SWISSPROT_2020.fasta' --use_deeplc --use_ms2pip -profile docker +nextflow run nf-core/mhcquant --input 'samples.tsv' --outdir --fasta 'SWISSPROT_2020.fasta' -profile docker ``` This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. diff --git a/nextflow.config b/nextflow.config index cf8a0bbb..4084e8ad 100644 --- a/nextflow.config +++ b/nextflow.config @@ -71,7 +71,7 @@ params { // MS2Rescore settings rescoring_engine = 'percolator' - feature_generators = 'ms2pip,deeplc' + feature_generators = 'deeplc,ms2pip' ms2pip_model = 'Immuno-HCD' // MultiQC options @@ -224,8 +224,8 @@ profiles { executor.memory = 8.GB } test { includeConfig 'conf/test.config' } - test_deeplc { includeConfig 'conf/test_deeplc.config' } - test_ms2pip { includeConfig 'conf/test_ms2pip.config' } + test_mokapot { includeConfig 'conf/test_mokapot.config' } + test_percolator { includeConfig 'conf/test_percolator.config' } test_ionannotator { includeConfig 'conf/test_ionannotator.config' } test_full { includeConfig 'conf/test_full.config' } } From 66b2bb78c8e2bc18d100ea2ed1cb3cadddf612fa Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Wed, 29 Nov 2023 12:28:02 +0000 Subject: [PATCH 04/15] lint --- bin/ms2rescore_cli.py | 93 ++++++++++++++++++++++++++++--------------- docs/output.md | 1 + docs/usage.md | 3 +- nextflow_schema.json | 2 +- 4 files changed, 64 insertions(+), 35 deletions(-) diff --git a/bin/ms2rescore_cli.py b/bin/ms2rescore_cli.py index 12ce6b8c..a580ccf1 100755 --- a/bin/ms2rescore_cli.py +++ b/bin/ms2rescore_cli.py @@ -10,44 +10,51 @@ from ms2rescore import rescore, package_data from psm_utils.io.idxml import IdXMLReader, IdXMLWriter -logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s') +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") + def parse_cli_arguments_to_config(**kwargs): """Update default MS²Rescore config with CLI arguments""" config = json.load(importlib.resources.open_text(package_data, "config_default.json")) for key, value in kwargs.items(): - # Skip these arguments since they need to set in a nested dict of feature_generators - if key in ['ms2pip_model', 'ms2_tolerance']: + if key in ["ms2pip_model", "ms2_tolerance"]: continue - elif key == 'feature_generators': - feature_generators = value.split(',') + elif key == "feature_generators": + feature_generators = value.split(",") # Reset feature generator dict since there might be default generators we don't want config["ms2rescore"]["feature_generators"] = {} - if 'basic' in feature_generators: + if "basic" in feature_generators: config["ms2rescore"]["feature_generators"]["basic"] = {} - if 'ms2pip' in feature_generators: - config["ms2rescore"]["feature_generators"]["ms2pip"] = {'model': kwargs['ms2pip_model'], 'ms2_tolerance': kwargs['ms2_tolerance']} - if 'deeplc' in feature_generators: - config["ms2rescore"]["feature_generators"]["deeplc"] = {'deeplc_retrain': False} - if 'maxquant' in feature_generators: + if "ms2pip" in feature_generators: + config["ms2rescore"]["feature_generators"]["ms2pip"] = { + "model": kwargs["ms2pip_model"], + "ms2_tolerance": kwargs["ms2_tolerance"], + } + if "deeplc" in feature_generators: + config["ms2rescore"]["feature_generators"]["deeplc"] = {"deeplc_retrain": False} + if "maxquant" in feature_generators: config["ms2rescore"]["feature_generators"]["maxquant"] = {} - if 'ionmob' in feature_generators: + if "ionmob" in feature_generators: config["ms2rescore"]["feature_generators"]["ionmob"] = {} - elif key == 'rescoring_engine': + elif key == "rescoring_engine": # Reset rescoring engine dict we want to allow only computing features config["ms2rescore"]["rescoring_engine"] = {} - if value == 'mokapot': - config["ms2rescore"]["rescoring_engine"]["mokapot"] = {"write_weights": True, - "write_txt": False, - "write_flashlfq": False, - "rng": kwargs['rng'], - "max_workers": kwargs['processes']} - if value == 'percolator': - logging.info("Percolator rescoring engine has been specified. Use the idXML containing rescoring features and run Percolator in a separate step.") + if value == "mokapot": + config["ms2rescore"]["rescoring_engine"]["mokapot"] = { + "write_weights": True, + "write_txt": False, + "write_flashlfq": False, + "rng": kwargs["rng"], + "max_workers": kwargs["processes"], + } + if value == "percolator": + logging.info( + "Percolator rescoring engine has been specified. Use the idXML containing rescoring features and run Percolator in a separate step." + ) continue else: @@ -71,26 +78,48 @@ def rescore_idxml(input_file, output_file, config): @click.command() -@click.option("-p", "--psm_file", help="Path to PSM file (PIN, mzIdentML, MaxQuant msms, X!Tandem XML, idXML)", required=True) -@click.option("-s", "--spectrum_path", help="Path to MGF/mzML spectrum file or directory with spectrum files (default: derived from identification file)", required=True) -@click.option("-o", "--output_path", help="Path and stem for output file names (default: derive from identification file)") +@click.option( + "-p", "--psm_file", help="Path to PSM file (PIN, mzIdentML, MaxQuant msms, X!Tandem XML, idXML)", required=True +) +@click.option( + "-s", + "--spectrum_path", + help="Path to MGF/mzML spectrum file or directory with spectrum files (default: derived from identification file)", + required=True, +) +@click.option( + "-o", "--output_path", help="Path and stem for output file names (default: derive from identification file)" +) @click.option("-l", "--log_level", help="Logging level (default: `info`)", default="info") @click.option("-n", "--processes", help="Number of parallel processes available to MS²Rescore", type=int, default=1) @click.option("-f", "--fasta_file", help="Path to FASTA file") -@click.option("-fg", "--feature_generators", help="Comma-separated list of feature generators to use (default: `ms2pip,deeplc`). See ms2rescore doc for further information", default="") +@click.option( + "-fg", + "--feature_generators", + help="Comma-separated list of feature generators to use (default: `ms2pip,deeplc`). See ms2rescore doc for further information", + default="", +) @click.option("-am", "--ms2pip_model", help="MS²PIP model (default: `Immuno-HCD`)", default="Immuno-HCD") -@click.option("-ms2tol", "--ms2_tolerance", help="Fragment mass tolerance [Da](default: `0.02`)", type=float, default=0.02) +@click.option( + "-ms2tol", "--ms2_tolerance", help="Fragment mass tolerance [Da](default: `0.02`)", type=float, default=0.02 +) @click.option("-re", "--rescoring_engine", help="Either mokapot or percolator (default: `mokapot`)", default="mokapot") -@click.option("-rng", "--rng", help="Seed for mokapot's random number generator (default: `4711`)", type=int, default=4711) -@click.option("-d", "--id_decoy_pattern", help="Regex decoy pattern (default: `DECOY_`)", default='^DECOY_') -@click.option("-lsb", "--lower_score_is_better", help="Interpretation of primary search engine score (default: True)", default=True) - - +@click.option( + "-rng", "--rng", help="Seed for mokapot's random number generator (default: `4711`)", type=int, default=4711 +) +@click.option("-d", "--id_decoy_pattern", help="Regex decoy pattern (default: `DECOY_`)", default="^DECOY_") +@click.option( + "-lsb", + "--lower_score_is_better", + help="Interpretation of primary search engine score (default: True)", + default=True, +) def main(**kwargs): config = parse_cli_arguments_to_config(**kwargs) logging.info("MS²Rescore config:") logging.info(config) - rescore_idxml(kwargs['psm_file'], kwargs['output_path'], config) + rescore_idxml(kwargs["psm_file"], kwargs["output_path"], config) + if __name__ == "__main__": sys.exit(main()) diff --git a/docs/output.md b/docs/output.md index e8e0e09a..d2f460de 100644 --- a/docs/output.md +++ b/docs/output.md @@ -75,6 +75,7 @@ This folder contains the intermediate results from various steps of the MHCquant - `comet`: Contains pin files generated by comet after database search - `rescoring` + - `{Sample}_{Condition}_(psm|ms2rescore).idXML`: File holding extra features generated by MS²Rescore that will be used by percolator or mokapot. - `{Sample}_{Condition}_pout.idXML`: Unfiltered percolator output. - `{Sample}_{Condition}_pout_filtered.idXML`: FDR-filtered percolator output. diff --git a/docs/usage.md b/docs/usage.md index 65869336..221f89a0 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -61,8 +61,7 @@ An [example samplesheet](../assets/samplesheet.tsv) has been provided with the p By default the pipline generates additional features using MS²PIP and DeepLC via the MS²Rescore framework (`--feature_generators deeplc,ms2pip`). Additional feature generators can be added (`basic,deeplc,ionmob,maxquant,ms2pip`) to boost identification rates and quality. Please make sure you provide the correct `--ms2pip_model` (default: `Immuno-HCD`). All available MS²PIP models can be found on [GitHub](https://github.com/compomics/ms2pip). -MS²Rescore creates a comprehensive QC report of the added features used for rescoring. This report is only available if `--rescoring_engine mokapot` is specified (default: `percolator`). The report can be found in `/multiqc/ms2rescore`. Further information on the tool itself can be read up in the published paper [Declerq et al. 2022](https://www.mcponline.org/article/S1535-9476(22)00074-3/fulltext) - +MS²Rescore creates a comprehensive QC report of the added features used for rescoring. This report is only available if `--rescoring_engine mokapot` is specified (default: `percolator`). The report can be found in `/multiqc/ms2rescore`. Further information on the tool itself can be read up in the published paper [Declerq et al. 2022]() ## Running the pipeline diff --git a/nextflow_schema.json b/nextflow_schema.json index be8f0606..1581b689 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -290,7 +290,7 @@ "default": "ms2pip,deeplc", "description": "Specify the feature generator that should be used for rescoring. One or multiple of basic,ms2pip,deeplc,ionmob" }, - "ms2pip_model":{ + "ms2pip_model": { "type": "string", "fa_icon": "fas fa-file-code", "default": "Immuno-HCD", From a02885a743f86a7063fb65545fe946e0939cf914 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Wed, 29 Nov 2023 15:16:04 +0000 Subject: [PATCH 05/15] update citations --- CITATIONS.md | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/CITATIONS.md b/CITATIONS.md index 64807a32..d2f1d683 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,9 +10,35 @@ ## Pipeline tools +- [OpenMS](https://pubmed.ncbi.nlm.nih.gov/27575624/) + + > Röst H, Sachsenberg T, Aiche S, Bielow C, Weisser H, Aicheler F, Andreotti S, Ehrlich HC, Gutenbrunner P, Kenar E, Liang X, Nahnsen S, Nilse L, Pfeuffer J, Rosenberger G, Rurik M, Schmitt U, Veit J, Walze M, Wojnar D, Wolski WE, Schilling O, Choudhary JS, Malmström L, Aebersold R, Reinert K, Kohlbacher O. OpenMS: a flexible open-source software platform for mass spectrometry data analysis. Nat Methods 13 741–748 (2016). doi: 10.1038/nmeth.3959. PubMed PMID: 27575624 + +- [Comet](https://pubmed.ncbi.nlm.nih.gov/26115965/) + + > Eng JK, Hoopmann MR, Jahan TA, Egertson JD, Noble WS, MacCoss MJ. A Deeper Look into Comet—Implementation and Features. Journal of the American Society for Mass Spectrometry 26,11 1865-1874 (2015). doi: 10.1007/s13361-015-1179-x. PMID: 26115965 + +- [DeepLC](https://pubmed.ncbi.nlm.nih.gov/34711972/) + > Bouwmeester R, Gabriels R, Hulstaert N, Martens L, Degroeve S. DeepLC can predict retention times for peptides that carry as-yet unseen modifications. Nature Methods 18,11 1363-1369 (2021). doi:1 0.1038/s41592-021-01301-5. PMID: 34711972 + +- [MS²PIP](https://pubmed.ncbi.nlm.nih.gov/31028400/) + > Gabriels R, Martens L, Degroeve S. Updated MS²PIP web server delivers fast and accurate MS² peak intensity prediction for multiple fragmentation methods, instruments and labeling techniques. Nucleic Acids Res. 2019 Jul 2;47(W1):W295-W299. doi: 10.1093/nar/gkz299. PMID: 31028400; PMCID: PMC6602496. + +- [MS²Rescore](https://pubmed.ncbi.nlm.nih.gov/35803561/) + > Declercq A, Bouwmeester R, Hirschler A, Carapito C, Degroeve S, Martens L, Gabriels R. MS²Rescore: Data-Driven Rescoring Dramatically Boosts Immunopeptide Identification Rates. Mol Cell Proteomics. 2022 Aug;21(8):100266. doi: 10.1016/j.mcpro.2022.100266. Epub 2022 Jul 6. PMID: 35803561; PMCID: PMC9411678. + +- [Ionmob](https://pubmed.ncbi.nlm.nih.gov/37540201/) + > Teschner D, Gomez-Zepeda D, Declercq A, Łącki MK, Avci S, Bob K, Distler U, Michna T, Martens L, Tenzer S, Hildebrandt A. Ionmob: a Python package for prediction of peptide collisional cross-section values. Bioinformatics. 2023 Sep 2;39(9):btad486. doi: 10.1093/bioinformatics/btad486. PMID: 37540201; PMCID: PMC10521631. + +- [Percolator](https://pubmed.ncbi.nlm.nih.gov/31407580/) + > Halloran JT, Zhang H, Kara K, Renggli C, The M, Zhang C, Rocke DM, Käll L, Noble WS. Speeding Up Percolator. J Proteome Res. 2019 Sep 6;18(9):3353-3359. doi: 10.1021/acs.jproteome.9b00288. Epub 2019 Aug 23. PMID: 31407580; PMCID: PMC6884961. + +- [Mokapot](https://pubmed.ncbi.nlm.nih.gov/33596079/) + > Fondrie WE, Noble WS. mokapot: Fast and Flexible Semisupervised Learning for Peptide Detection. J Proteome Res. 2021 Apr 2;20(4):1966-1971. doi: 10.1021/acs.jproteome.0c01010. Epub 2021 Feb 17. PMID: 33596079; PMCID: PMC8022319. + - [FRED2](https://pubmed.ncbi.nlm.nih.gov/27153717/) - > Schubert B, WalzerM, Brachvogel HP, Szolek A, Mohr C, Kohlbacher O, FRED 2: an immunoinformatics framework for Python, Bioinformatics, Volume 32, Issue 13, 1 July 2016, Pages 2044–2046. doi: 10.109 bioinformatics/btw113. PubMed PMID: 27153717; PubMed Central PMCID; PMC4920123. + > Schubert B, WalzerM, Brachvogel HP, Szolek A, Mohr C, Kohlbacher O, FRED 2: an immunoinformatics framework for Python, Bioinformatics, Volume 32, Issue 13, 1 July 2016, Pages 2044–2046. doi: 10.1093/bioinformatics/btw113. PubMed PMID: 27153717; PubMed Central PMCID; PMC4920123. - [MHCflurry](https://pubmed.ncbi.nlm.nih.gov/27153717/) @@ -22,10 +48,6 @@ > Shao XM, Bhattacharya R, Huang J, Sivakumar IKA, Tokheim C, Zheng L, Hirsch D, Kaminow B, Omdahl A, Bonsack M, Riemer AB, Velculescu VE, Anagnostou V, Pagel KA, Karchin R. High-Throughput Prediction of MHC Class I and II Neoantigens with MHCnuggets. Cancer Immunol Res. 2020 Mar;8(3):396-408. doi: 10.1158/2326-6066.CIR-19-0464. Epub 2019 Dec 23. PMID: 31871119; PMCID: PMC7056596. -- [OpenMS](https://pubmed.ncbi.nlm.nih.gov/27575624/) - - > Röst H, Sachsenberg T, Aiche S, Bielow C, Weisser H, Aicheler F, Andreotti S, Ehrlich HC, Gutenbrunner P, Kenar E, Liang X, Nahnsen S, Nilse L, Pfeuffer J, Rosenberger G, Rurik M, Schmitt U, Veit J, Walze M, Wojnar D, Wolski WE, Schilling O, Choudhary JS, Malmström L, Aebersold R, Reinert K, Kohlbacher O. OpenMS: a flexible open-source software platform for mass spectrometry data analysis. Nat Methods 13 741–748 (2016). doi: 10.1038/nmeth.3959. PubMed PMID: 27575624 - - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. From 6abfde029c7eff2c80964864848d1782f8642b3a Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Thu, 30 Nov 2023 10:22:41 +0000 Subject: [PATCH 06/15] bump to dev version --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 4084e8ad..b08086d1 100644 --- a/nextflow.config +++ b/nextflow.config @@ -282,7 +282,7 @@ manifest { description = """Identify and quantify peptides from mass spectrometry raw data""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '2.5.0' + version = '2.6.0dev' doi = '10.1021/acs.jproteome.9b00313' } From 37ee492c6c1b087e43139852aaa7d06de84f7f57 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Thu, 30 Nov 2023 12:12:24 +0000 Subject: [PATCH 07/15] remove test_full from gh workflow --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5e15820b..bd5f86ab 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -55,10 +55,10 @@ jobs: # Test latest edge release of Nextflow - NXF_VER: "" NXF_EDGE: "1" - tests: ["test_mokapot", "test_percolator", "test_ionannotator", "test_full"] + tests: ["test_mokapot", "test_percolator", "test_ionannotator"] steps: - name: Check out pipeline code - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Install Nextflow env: NXF_VER: ${{ matrix.NXF_VER }} From d2d2cb081d142755c9cd2b7f3ebc223d46605035 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Thu, 30 Nov 2023 12:36:56 +0000 Subject: [PATCH 08/15] depricate ms2pip and deeplc cli --- bin/deeplc_cli.py | 418 ---------------------------------------- bin/ms2pip_cli.py | 247 ------------------------ modules/local/deeplc.nf | 41 ---- modules/local/ms2pip.nf | 41 ---- 4 files changed, 747 deletions(-) delete mode 100755 bin/deeplc_cli.py delete mode 100755 bin/ms2pip_cli.py delete mode 100644 modules/local/deeplc.nf delete mode 100644 modules/local/ms2pip.nf diff --git a/bin/deeplc_cli.py b/bin/deeplc_cli.py deleted file mode 100755 index 0ab42681..00000000 --- a/bin/deeplc_cli.py +++ /dev/null @@ -1,418 +0,0 @@ -#!/usr/bin/env python -# Written by Jonas Scheid and Steffen Lemke - -import click -import logging -import math -import os -import pandas as pd -import numpy as np -import sys -import tensorflow as tf -from deeplc import DeepLC -from pyopenms import IdXMLFile -from sklearn.preprocessing import MinMaxScaler - -os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" # Set TensorFlow logging level to suppress warnings -tf.get_logger().setLevel(logging.ERROR) # Filter out specific warnings - -# initate logger -console = logging.StreamHandler() -formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") -console.setFormatter(formatter) -LOG = logging.getLogger("DeepLC prediction") -LOG.addHandler(console) -LOG.setLevel(logging.INFO) - - -def parse_idxml(path: str) -> tuple[list, list]: - """ - Parse idXML file and return PeptideIdentification and ProteinIdentification objects. - - :param path: path to idXML file - :type path: str - :return: ProteinIdentification and PeptideIdentification objects - :rtype: (list, list) - """ - protein_ids = [] - peptide_ids = [] - IdXMLFile().load(path, protein_ids, peptide_ids) - - return protein_ids, peptide_ids - - -def generate_deeplc_input(peptide_ids: list) -> pd.DataFrame: - """ - Generate input for DeepLC from PeptideIdentification objects. - - :param peptide_ids: list of PeptideIdentification objects - :type peptide_ids: list - :return: Pandas DataFrame containing the input for DeepLC - :rtype: pd.DataFrame - - """ - data = [] - for peptide_id in peptide_ids: - tr = peptide_id.getRT() - scan_id = peptide_id.getMetaValue("spectrum_reference") - for hit in peptide_id.getHits(): - sequence = hit.getSequence() - unmodified_sequence = sequence.toUnmodifiedString() - x_corr = hit.getMetaValue("MS:1002252") - target_decoy = hit.getMetaValue("target_decoy") - - # get all modificaitons - hit_mods = [] - for pos in range(0, sequence.size()): - residue = sequence.getResidue(pos) - if residue.isModified(): - hit_mods.append("|".join([str(pos + 1), residue.getModificationName()])) - if hit_mods == []: - modifications = "" - else: - modifications = "|".join(hit_mods) - - data.append([unmodified_sequence, modifications, tr, x_corr, target_decoy, str(sequence), scan_id]) - - df_deeplc_input = pd.DataFrame( - data, columns=["seq", "modifications", "tr", "x_corr", "target_decoy", "seq_with_mods", "scan_id"] - ) - - return df_deeplc_input - - -def generate_calibration_df(df: pd.DataFrame, num_bins: int) -> pd.DataFrame: - """ - Generates a pandas DataFrame containing calibration peptides for DeepLC. - The input DataFrame is sorted by measured retention time and sliced into - bins of equal peptide count. For each bin the peptides with the highest - x_correlation is selected and returned in a Pandas DataFrame - - :param df: Input DataFrame with retention time of each peptide and xcorr score - :type df: pd.DataFrame - :param num_bins: Number of bins/number of calibratoin peptides to be extracted - :type num_bins: int - :return: Pandas DataFrame containing calibration peptides equal to index-based num_bins - :rtype: pd.DataFrame - - """ - # remove decoys - df = df[df["target_decoy"] != "decoy"] - - # Compute the bin size based on the number of bins - bin_size = len(df) // num_bins - - # Sort the dataframe by tr values - sorted_df = df.sort_values("tr") - - # Rows for dataframe - filtered_row = [] - - # Iterate over the bins - for i in range(num_bins): - # Get the start and end indices of the current bin - start_index = i * bin_size - end_index = start_index + bin_size - - # Get the subset of the dataframe for the current bin - bin_df = sorted_df.iloc[start_index:end_index] - - # Find the row with the maximum x_corr value in the current bin - max_row = bin_df.loc[bin_df["x_corr"].idxmax()] - - # Append the max row to the filtered dataframe - filtered_row.append(max_row) - - # Create DataFrame - calibration_df = pd.DataFrame(filtered_row) - - return calibration_df.copy() - - -def generate_calibration_df_with_RT_bins(df: pd.DataFrame, num_bins: int) -> pd.DataFrame: - """ - Generates a pandas DataFrame containing calibration peptides for DeepLC. - The input DataFrame is sorted by measured retention time and sliced into bins of equal retention time. - For each bin the peptides with the highest x_correlation is selected and return in a Pandas DataFrame - - :param df: Input DataFrame with retention time of each peptide and xcorr score - :type df: pd.DataFrame - :param num_bins: Number of bins/number of calibratoin peptides to be extracted - :type num_bins: int - :return: Pandas DataFrame containing calibration peptides equal to RT-based num_bins - :rtype: pd.DataFrame - """ - # remove decoys - df = df[df["target_decoy"] != "decoy"] - - # Sort the dataframe by tr values - sorted_df = df.sort_values("tr") - - # Create list of linear bins between min and max tr with num_bins and access dataframe with index - bin_size = (sorted_df["tr"].max() - sorted_df["tr"].min()) / num_bins - - # Rows for dataframe - filtered_row = [] - - # Iterate over the bins - for i in range(num_bins): - # Get the start and end indices of the current bin - start_tr = sorted_df["tr"].min() + i * bin_size - end_tr = start_tr + bin_size - - # Get the subset of the dataframe for the current bin - bin_df = sorted_df[(sorted_df["tr"] >= start_tr) & (sorted_df["tr"] < end_tr)] - - # skip if bin is empty (no measurements in RT bin) - if len(bin_df) == 0: - continue - - # Find the row with the maximum x_corr value in the current bin - max_row = bin_df.loc[bin_df["x_corr"].idxmax()] - - # Append the max row to the filtered dataframe - filtered_row.append(max_row) - - # Create DataFrame - calibration_df = pd.DataFrame(filtered_row) - - return calibration_df - - -def min_max_scaler(df: pd.DataFrame) -> pd.DataFrame: - """ - Scales the predicted retention time values of the input DataFrame to the range of the measured retention time values - - :param df: Input DataFrame with predicted retention time values - :type df: pd.DataFrame - :return: DataFrame with scaled predicted retention time values - :rtype: pd.DataFrame - """ - scaler = MinMaxScaler((min(df["tr"]), max(df["tr"]))) - df["predicted_RT"] = scaler.fit_transform(df[["predicted_RT"]]) - - return df - - -def run_deeplc(df: pd.DataFrame, calibration_df: pd.DataFrame = None) -> pd.DataFrame: - dlc = DeepLC() - if calibration_df is not None: - dlc.calibrate_preds(seq_df=calibration_df) - preds = dlc.make_preds(seq_df=df) - df["predicted_RT"] = preds - else: - preds = dlc.make_preds(seq_df=df, calibrate=False) - df["predicted_RT"] = preds - df = min_max_scaler(df) - - return df - - -def add_rt_error( - peptide_ids: list, - prediction_dict: dict, - add_abs_rt_error: bool = False, - add_sqr_rt_error: bool = False, - add_log_rt_error: bool = False, -) -> list: - """ - Adds the error of the predicted retention time in comparison to the measured retention time to each peptide hit. - Different error scores can be selected. - - :param peptide_ids: list of PeptideIdentification objects - :type peptide_ids: list - :param prediction_dict: dictionary containing the predicted retention time for each peptide sequence - :type prediction_dict: dict - :param add_abs_rt_error: add absolute RT prediction errors to idXML - :type add_abs_rt_error: bool - :param add_sqr_rt_error: add squared RT prediction errors to idXML - :type add_sqr_rt_error: bool - :param add_log_rt_error: add log RT prediction errors to idXML - :type add_log_rt_error: bool - :return: list of PeptideIdentification objects with added error scores - :rtype: list - """ - noncanonical_aa = ["B", "J", "O", "U", "X", "Z"] - peptide_hits_noncanonical_aa = {} - abs_rt_errors = [] - sqr_rt_errors = [] - log_rt_errors = [] - - for peptide_id in peptide_ids: - # Get measured Retention time - measured_rt = peptide_id.getRT() - scan_id = peptide_id.getMetaValue("spectrum_reference") - - # Initilaize list for edited hits (with added features) - new_hits = [] - for hit in peptide_id.getHits(): - sequence = hit.getSequence() - unmodified_sequence = sequence.toUnmodifiedString() - # Catch peptides with noncanonical amino acids and save spectrum reference and hit in dictionary - if any(aa in noncanonical_aa for aa in unmodified_sequence): - peptide_hits_noncanonical_aa[(peptide_id.getMetaValue("spectrum_reference"), sequence)] = hit - continue - - predicted_rt = prediction_dict[(str(sequence), scan_id)] - - # calculate abs error - if add_abs_rt_error: - abs_error = abs(measured_rt - predicted_rt) - hit.setMetaValue("deeplc_abs_error", abs_error) - abs_rt_errors.append(abs_error) - - # calculate seq error - if add_sqr_rt_error: - sqr_error = abs(measured_rt - predicted_rt) ** 2 - hit.setMetaValue("deeplc_sqr_error", sqr_error) - sqr_rt_errors.append(sqr_error) - - # calcultae log error - if add_log_rt_error: - log_error = math.log(abs(measured_rt - predicted_rt)) - hit.setMetaValue("deeplc_log_error", log_error) - log_rt_errors.append(log_error) - - new_hits.append(hit) - peptide_id.setHits(new_hits) - - # Add peptides with noncanonical amino acids to peptide_ids and return the median error - for scan_id, sequence in peptide_hits_noncanonical_aa.keys(): - LOG.info( - f"Peptide {sequence} hit of spectrum {scan_id} contains noncanonical amino acids. Adding median error(s)" - ) - # get peptide id for scan id - peptide_id = [ - peptide_id for peptide_id in peptide_ids if peptide_id.getMetaValue("spectrum_reference") == scan_id - ][0] - hit = peptide_hits_noncanonical_aa[(scan_id, sequence)] - if add_abs_rt_error: - hit.setMetaValue("deeplc_abs_error", np.median(abs_rt_errors)) - if add_sqr_rt_error: - hit.setMetaValue("deeplc_sqr_error", np.median(sqr_rt_errors)) - if add_log_rt_error: - hit.setMetaValue("deeplc_log_error", np.median(log_rt_errors)) - peptide_id.insertHit(hit) - - peptide_ids = peptide_ids - - return peptide_ids - - -@click.command() -@click.option("-i", "--input", help="input path of idXML", required=True) -@click.option("-o", "--output", help="output path of idXML", required=True) -@click.option( - "--calibration_mode", - type=click.Choice(["idx_bin", "rt_bin", "min_max"]), - default="idx_bin", - help="Calibration method", -) -@click.option( - "--calibration_bins", - type=click.IntRange(min=2), - default=20, - help="number of bins for calibration", -) -@click.option( - "--add_abs_rt_error", - is_flag=True, - help="add absolute RT prediction errors to idXML", -) -@click.option( - "--add_sqr_rt_error", - is_flag=True, - help="add squared RT prediction errors to idXML (default if nothing is selected)", -) -@click.option("--add_log_rt_error", is_flag=True, help="add log RT prediction errors to idXML") -@click.option( - "--debug", - is_flag=True, - help="Additionally write out calibration file and deeplc output", -) -def main( - input: str, - output: str, - calibration_mode: str, - calibration_bins: int, - add_abs_rt_error: bool, - add_sqr_rt_error: bool, - add_log_rt_error: bool, - debug: bool, -): - # check if at least one error is selected, if not set squared error to true - num_true = sum([add_abs_rt_error, add_sqr_rt_error, add_log_rt_error]) - if num_true == 0: - LOG.info("No error calculation was set, falling back to squared error") - add_sqr_rt_error = True - - LOG.info("Parse idXML") - protein_ids, peptide_ids = parse_idxml(input) - - LOG.info("Generate DeepLC input") - df_deeplc_input = generate_deeplc_input(peptide_ids) - # Skip sequences with noncanonical amino acids, DeepLC cannot predict them - # Add them later with median error - df_deeplc_input = df_deeplc_input[~df_deeplc_input["seq"].str.contains("B|J|O|U|X|Z")] - - if len(df_deeplc_input[df_deeplc_input["target_decoy"] != "decoy"]) <= calibration_bins: - LOG.info("Number of peptide hits is smaller than calibration bins. Falling back to min/max scaling") - calibration_mode = "min_max" - - # Run DeepLC - if calibration_mode == "rt_bin": - LOG.info("Run DeepLC with RT bin calibration") - calibration_df = generate_calibration_df_with_RT_bins(df_deeplc_input, calibration_bins) - if debug: - calibration_df.to_csv(output + "_calibration.tsv", index=False, sep="\t") - df_deeplc_input.to_csv(output + "_deeplc_input.tsv", index=False, sep="\t") - df_deeplc_output = run_deeplc(df_deeplc_input, calibration_df) - - elif calibration_mode == "idx_bin": - LOG.info("Run DeepLC with index bin calibration") - calibration_df = generate_calibration_df(df_deeplc_input, calibration_bins) - if debug: - calibration_df.to_csv(output + "_calibration.tsv", index=False, sep="\t") - df_deeplc_input.to_csv(output + "_deeplc_input.tsv", index=False, sep="\t") - df_deeplc_output = run_deeplc(df_deeplc_input, calibration_df) - - elif calibration_mode == "min_max": - LOG.info("Run DeepLC with min/max calibration") - if debug: - df_deeplc_input.to_csv(output + "_deeplc_input.tsv", index=False, sep="\t") - df_deeplc_output = run_deeplc(df_deeplc_input) - - if debug: - df_deeplc_output.to_csv(output + "_deeplc_output.tsv", index=False, sep="\t") - - # Create map containing the predicted retention time for each peptide sequence and modification - sequence_to_prediction = {} - for seq_mod, scan_id, pred_rt in zip( - df_deeplc_output["seq_with_mods"], - df_deeplc_output["scan_id"], - df_deeplc_output["predicted_RT"], - ): - sequence_to_prediction[(seq_mod, scan_id)] = pred_rt - - LOG.info("Add error to idXML") - peptide_ids_pred_RT = add_rt_error( - peptide_ids, - sequence_to_prediction, - add_abs_rt_error, - add_sqr_rt_error, - add_log_rt_error, - ) - - LOG.info("Write idXML") - IdXMLFile().store(output, protein_ids, peptide_ids_pred_RT) - - if debug: - df_deeplc_input.to_csv(output + "_deeplc_input.tsv", index=False, sep="\t") - if calibration_mode == "rt_bin" or calibration_mode == "idx_bin": - calibration_df.to_csv(output + "_calibration.tsv", index=False, sep="\t") - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/bin/ms2pip_cli.py b/bin/ms2pip_cli.py deleted file mode 100755 index 35df9eda..00000000 --- a/bin/ms2pip_cli.py +++ /dev/null @@ -1,247 +0,0 @@ -#!/usr/bin/env python -# Written by Jonas Scheid and Steffen Lemke - -import click -import logging -import numpy as np -import pandas as pd -import sys -from ms2pip.ms2pipC import MS2PIP -from pyopenms import IdXMLFile, ModificationsDB - - -# initate logger -console = logging.StreamHandler() -formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") -console.setFormatter(formatter) -LOG = logging.getLogger("MS2pip prediction") -LOG.addHandler(console) -LOG.setLevel(logging.INFO) - - -def parse_idxml(path: str) -> tuple[list, list]: - """ - Parse idXML file and return PeptideIdentification and ProteinIdentification objects. - - :param path: path to idXML file - :type path: str - :return: ProteinIdentification and PeptideIdentification objects - :rtype: (list, list) - """ - protein_ids = [] - peptide_ids = [] - IdXMLFile().load(path, protein_ids, peptide_ids) - - return protein_ids, peptide_ids - - -def peptide_ids_to_peprec_dataframe(peptide_ids: list, hit_idx: int = 0) -> pd.DataFrame: - """ - All the peptide identifications are parsed into a DataFrame in the style of - a PEPREC file (https://github.com/compomics/ms2pip#peprec-file). - - :param peptide_ids: List containing PeptideIdentification - :type peptide_ids: list - :param hit_idx: hit index to generate a peprec - :type hit_idx: int - :return: peprec pandas dataframe - :rtype: pd.DataFrame - """ - - columns = ["spec_id", "modifications", "peptide", "charge"] - data = [] - spectrum_reference_to_seq = {} - - for peptide_id in peptide_ids: - if len(peptide_id.getHits()) <= hit_idx: - continue - hit = peptide_id.getHits()[hit_idx] - spectrum_reference = peptide_id.getMetaValue("spectrum_reference") - - charge = hit.getCharge() - sequence = hit.getSequence() - unmodified_sequence = sequence.toUnmodifiedString() - - spectrum_reference_to_seq[spectrum_reference] = str(sequence) - - hit_mods = [] - for pos in range(0, sequence.size()): - residue = sequence.getResidue(pos) - if residue.isModified(): - hit_mods.append("|".join([str(pos + 1), residue.getModificationName()])) - if hit_mods == []: - modifications = "-" - else: - modifications = "|".join(hit_mods) - - data.append([spectrum_reference, modifications, unmodified_sequence, charge]) - - return pd.DataFrame(data, columns=columns), spectrum_reference_to_seq - - -def get_complete_spectrum_correlation(df_ms2pip_output: pd.DataFrame, method: str) -> pd.DataFrame: - """ - Get correlation coefficient for each predicted spectrum vs the measured one - - :param df_ms2pip_output: pandas dataframe of the ms2pip output with individual ion prediction values and DeepLC RT prediction - :type hit_idx: pd.DataFrame - :return: dict {: , : {}, ... } - :rtype: pd.DataFrame - """ - scannr_to_total_corr = {} - grouped_spec = df_ms2pip_output.groupby("spec_id") - correlations_spec = grouped_spec[["prediction", "target"]].corr(method=method) - - for group, corr in correlations_spec.groupby(level=[0, 1]): - correlation_value = corr.iloc[0, 1] - spec_id = group[0] - if group[1] == "prediction": - if np.isnan(correlation_value): - correlation_value = 0 - scannr_to_total_corr[spec_id] = correlation_value - - data = { - "ScanNr": scannr_to_total_corr.keys(), - "ion_corr": scannr_to_total_corr.values(), - } - df = pd.DataFrame.from_dict(data) - - return df - - -def generate_params_config( - fixed_modifications: list, - variable_modifications: list, - model_name: str, - fragment_error: float, -) -> dict: - """ - Generate the MS2PIP configuration file. - - :param fixed_modifications: List of fixed modifications to consider - :type fixed_modifications: list - :param modifications: List of modifications to consider - :type modifications: list - :param model_name: Name of the model to use - :type model_name: str - :param fragment_error: Fragment error to use - :type fragment_error: float - :return: MS2PIP configuration file - :rtype: dict - """ - mods = set(fixed_modifications.strip().split(",") + variable_modifications.strip().split(",")) - # Remove empty strings - mods = [mod for mod in mods if mod] - params = { - "ms2pip": { - "ptm": [ - f"{ModificationsDB().getModification(mod).getId()},{ModificationsDB().getModification(mod).getDiffMonoMass()},opt,{ModificationsDB().getModification(mod).getOrigin()}" - for mod in mods - ], - "model": model_name, - "frag_error": fragment_error, - "out": "csv", - "sptm": [], - "gptm": [], - } - } - return params - - -@click.command() -@click.option("--input_idxml", help="input path of idXML", required=True) -@click.option("--input_mzml", help="input path of mzML", required=True) -@click.option("--output_idxml", help="output path of idXML", required=True) -@click.option( - "--num_hits", - type=click.IntRange(min=1), - default=1, - help="number of peptides hits", -) -@click.option( - "--model_name", - type=str, - help="Name of MS2pip model (https://github.com/compomics/ms2pip#specialized-prediction-models)", -) -@click.option( - "--model_path", - type=str, - help="path to MS2pip model", -) -@click.option( - "--fragment_error", - type=float, - help="Fragment mass error in Da", -) -@click.option( - "--variable_mods", - type=str, - help="List of variable modifications", -) -@click.option( - "--fixed_mods", - type=str, - help="List of fixed modifications", -) -@click.option("--add_pearson", is_flag=True, help="add pearson spectrum simliartity") -@click.option( - "--num_cpus", - type=int, - help="number of cpus to use", -) -def main( - input_idxml: str, - input_mzml: str, - output_idxml: str, - num_hits: int, - model_name: str, - model_path: str, - fragment_error: float, - variable_mods: str, - fixed_mods: str, - add_pearson: bool, - num_cpus: int, -): - LOG.info("Parse idXML") - protein_ids, peptide_ids = parse_idxml(input_idxml) - - LOG.info("Generate params file for MS2pip") - params = generate_params_config(fixed_mods, variable_mods, model_name, fragment_error) - - LOG.info("Make MS2pip predictions") - scan_nr_seq_to_corr = {} - for hit_idx in range(num_hits): # number of hits to consider - df_peprec, scan_nr_to_seq = peptide_ids_to_peprec_dataframe(peptide_ids, hit_idx) - ms2pip = MS2PIP(pep_file=df_peprec, spec_file=input_mzml, params=params, return_results=True, num_cpu=num_cpus) - predictions = ms2pip.run() - correlation_df = get_complete_spectrum_correlation(predictions, "pearson") - - for scan_nr, ion_corr in zip(correlation_df["ScanNr"], correlation_df["ion_corr"]): - sequence = scan_nr_to_seq[scan_nr] - scan_nr_seq_to_corr[(scan_nr, sequence)] = ion_corr - - LOG.info("Add correlations scores to peptide identifications") - for peptide_id in peptide_ids: - spectrum_reference = peptide_id.getMetaValue("spectrum_reference") - new_hits = [] - for hit in peptide_id.getHits(): - sequence = str(hit.getSequence()) - if (spectrum_reference, sequence) in scan_nr_seq_to_corr.keys(): - hit.setMetaValue( - "spectrum_correlation", - scan_nr_seq_to_corr[(spectrum_reference, sequence)], - ) - else: - LOG.info(f"No correlation could be computed for {str(sequence)}") - hit.setMetaValue("spectrum_correlation", 0) - new_hits.append(hit) - peptide_id.setHits(new_hits) - - LOG.info("Write idXML") - IdXMLFile().store(output_idxml, protein_ids, peptide_ids) - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/modules/local/deeplc.nf b/modules/local/deeplc.nf deleted file mode 100644 index 5f7ea37d..00000000 --- a/modules/local/deeplc.nf +++ /dev/null @@ -1,41 +0,0 @@ -process DEEPLC { - tag "$meta.id" - label 'process_medium' - - conda "bioconda::deeplc=2.2.0 bioconda::pyopenms=2.9.1" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-beb85d5ee68ba9251d26079ca28797d51ea3c49a:857e5e7908422b6ea5016a3c313f67087fbe2f8b-0' : - 'biocontainers/mulled-v2-beb85d5ee68ba9251d26079ca28797d51ea3c49a:857e5e7908422b6ea5016a3c313f67087fbe2f8b-0' }" - - input: - tuple val(meta), path(idxml_in) - - output: - tuple val(meta), path('*.idXML'), emit: idxml - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def prefix = idxml_in.baseName - def add_abs_rt_error = params.deeplc_add_abs_rt_error ? "--add_abs_rt_error" : "" - def add_sqr_rt_error = params.deeplc_add_sqr_rt_error ? "--add_sqr_rt_error" : "" - def add_log_rt_error = params.deeplc_add_log_rt_error ? "--add_log_rt_error" : "" - - """ - deeplc_cli.py \\ - --input $idxml_in \\ - --output ${prefix}_deeplc.idXML \\ - --calibration_mode ${params.deeplc_calibration_mode} \\ - --calibration_bins ${params.deeplc_calibration_bins} \\ - $add_abs_rt_error \\ - $add_sqr_rt_error \\ - $add_log_rt_error - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - DeepLC: \$(deeplc --version) - END_VERSIONS - """ -} diff --git a/modules/local/ms2pip.nf b/modules/local/ms2pip.nf deleted file mode 100644 index c66cb357..00000000 --- a/modules/local/ms2pip.nf +++ /dev/null @@ -1,41 +0,0 @@ -process MS2PIP { - tag "$meta.id" - label 'process_low' - - conda "bioconda::ms2pip=3.11.0 bioconda::pyopenms=2.9.1" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-beb85d5ee68ba9251d26079ca28797d51ea3c49a:857e5e7908422b6ea5016a3c313f67087fbe2f8b-0' : - 'biocontainers/mulled-v2-beb85d5ee68ba9251d26079ca28797d51ea3c49a:857e5e7908422b6ea5016a3c313f67087fbe2f8b-0' }" - - input: - tuple val(meta), path(idxml_in), path(mzml) - - output: - tuple val(meta), path('*.idXML'), emit: idxml - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def prefix = idxml_in.baseName - def fragment_error = params.fragment_mass_tolerance * 2 - - """ - ms2pip_cli.py \\ - --input_idxml $idxml_in \\ - --input_mzml $mzml \\ - --output_idxml ${prefix}_ms2pip.idXML \\ - --num_hits ${params.num_hits} \\ - --model_name ${params.ms2pip_model_name} \\ - --fragment_error $fragment_error \\ - --variable_mods '${params.variable_mods}' \\ - --fixed_mods '${params.fixed_mods}' \\ - --num_cpus ${task.cpus} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - MS2PIP: \$(conda list | grep "ms2pip" | awk 'NR==2 {print \$2}') - END_VERSIONS - """ -} From c9b5b0b7d6b819dbc871953191233108603c3a65 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Thu, 30 Nov 2023 12:37:46 +0000 Subject: [PATCH 09/15] fix linting --- .github/workflows/awsfulltest.yml | 3 --- CITATIONS.md | 8 +++++++- README.md | 4 ++-- assets/multiqc_config.yml | 4 ++-- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index 31969965..69011d63 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -15,9 +15,6 @@ jobs: steps: - name: Launch workflow via tower uses: seqeralabs/action-tower-launch@v2 - # TODO nf-core: You can customise AWS full pipeline tests as required - # Add full size test data (but still relatively small datasets for few samples) - # on the `test_full.config` test runs with only one set of parameters with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} diff --git a/CITATIONS.md b/CITATIONS.md index d2f1d683..96538f1c 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -16,24 +16,30 @@ - [Comet](https://pubmed.ncbi.nlm.nih.gov/26115965/) - > Eng JK, Hoopmann MR, Jahan TA, Egertson JD, Noble WS, MacCoss MJ. A Deeper Look into Comet—Implementation and Features. Journal of the American Society for Mass Spectrometry 26,11 1865-1874 (2015). doi: 10.1007/s13361-015-1179-x. PMID: 26115965 + > Eng JK, Hoopmann MR, Jahan TA, Egertson JD, Noble WS, MacCoss MJ. A Deeper Look into Comet—Implementation and Features. Journal of the American Society for Mass Spectrometry 26,11 1865-1874 (2015). doi: 10.1007/s13361-015-1179-x. PMID: 26115965 - [DeepLC](https://pubmed.ncbi.nlm.nih.gov/34711972/) + > Bouwmeester R, Gabriels R, Hulstaert N, Martens L, Degroeve S. DeepLC can predict retention times for peptides that carry as-yet unseen modifications. Nature Methods 18,11 1363-1369 (2021). doi:1 0.1038/s41592-021-01301-5. PMID: 34711972 - [MS²PIP](https://pubmed.ncbi.nlm.nih.gov/31028400/) + > Gabriels R, Martens L, Degroeve S. Updated MS²PIP web server delivers fast and accurate MS² peak intensity prediction for multiple fragmentation methods, instruments and labeling techniques. Nucleic Acids Res. 2019 Jul 2;47(W1):W295-W299. doi: 10.1093/nar/gkz299. PMID: 31028400; PMCID: PMC6602496. - [MS²Rescore](https://pubmed.ncbi.nlm.nih.gov/35803561/) + > Declercq A, Bouwmeester R, Hirschler A, Carapito C, Degroeve S, Martens L, Gabriels R. MS²Rescore: Data-Driven Rescoring Dramatically Boosts Immunopeptide Identification Rates. Mol Cell Proteomics. 2022 Aug;21(8):100266. doi: 10.1016/j.mcpro.2022.100266. Epub 2022 Jul 6. PMID: 35803561; PMCID: PMC9411678. - [Ionmob](https://pubmed.ncbi.nlm.nih.gov/37540201/) + > Teschner D, Gomez-Zepeda D, Declercq A, Łącki MK, Avci S, Bob K, Distler U, Michna T, Martens L, Tenzer S, Hildebrandt A. Ionmob: a Python package for prediction of peptide collisional cross-section values. Bioinformatics. 2023 Sep 2;39(9):btad486. doi: 10.1093/bioinformatics/btad486. PMID: 37540201; PMCID: PMC10521631. - [Percolator](https://pubmed.ncbi.nlm.nih.gov/31407580/) + > Halloran JT, Zhang H, Kara K, Renggli C, The M, Zhang C, Rocke DM, Käll L, Noble WS. Speeding Up Percolator. J Proteome Res. 2019 Sep 6;18(9):3353-3359. doi: 10.1021/acs.jproteome.9b00288. Epub 2019 Aug 23. PMID: 31407580; PMCID: PMC6884961. - [Mokapot](https://pubmed.ncbi.nlm.nih.gov/33596079/) + > Fondrie WE, Noble WS. mokapot: Fast and Flexible Semisupervised Learning for Peptide Detection. J Proteome Res. 2021 Apr 2;20(4):1966-1971. doi: 10.1021/acs.jproteome.0c01010. Epub 2021 Feb 17. PMID: 33596079; PMCID: PMC8022319. - [FRED2](https://pubmed.ncbi.nlm.nih.gov/27153717/) diff --git a/README.md b/README.md index 679be851..1a39e7db 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # ![nf-core/mhcquant](docs/images/nf-core-mhcquant_logo_light.png#gh-light-mode-only) ![nf-core/mhcquant](docs/images/nf-core-mhcquant_logo_dark.png#gh-dark-mode-only) [![GitHub Actions CI Status](https://github.com/nf-core/mhcquant/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/mhcquant/actions?query=workflow%3A%22nf-core+CI%22) -[![GitHub Actions Linting Status](https://github.com/nf-core/mhcquant/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/mhcquant/actions?query=workflow%3A%22nf-core+linting%22)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/mhcquant/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) +[![GitHub Actions Linting Status](https://github.com/nf-core/mhcquant/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/mhcquant/actions?query=workflow%3A%22nf-core+linting%22)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/mhcquant/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.8427707-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.8427707) [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) @@ -175,7 +175,7 @@ For further information or help, don't hesitate to get in touch on the [Slack `# ## Citations -If you use nf-core/mhcquant for your analysis, please cite it using the following doi: [10.5281/zenodo.1569909](https://doi.org/10.5281/zenodo.1569909) and the corresponding manuscript: +If you use nf-core/mhcquant for your analysis, please cite it using the following doi: [10.5281/zenodo.8427707](https://doi.org/10.5281/zenodo.8427707) and the corresponding manuscript: > **MHCquant: Automated and Reproducible Data Analysis for Immunopeptidomics** > diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 04621a17..0c4c7e41 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -3,9 +3,9 @@ custom_logo_url: https://github.com/nf-core/mhcquant custom_logo_title: "nf-core/mhcquant" report_comment: > - This report has been generated by the nf-core/mhcquant + This report has been generated by the nf-core/mhcquant analysis pipeline. For information about how to interpret these results, please see the - documentation. + documentation. report_section_order: "nf-core-mhcquant-methods-description": order: -1000 From 8f0735234e3d1f8c766f4aecbbf0cf98879deca2 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Thu, 30 Nov 2023 12:41:59 +0000 Subject: [PATCH 10/15] remove deeplc and ms2pip imports --- workflows/mhcquant.nf | 2 -- 1 file changed, 2 deletions(-) diff --git a/workflows/mhcquant.nf b/workflows/mhcquant.nf index ff7751ea..0a548005 100644 --- a/workflows/mhcquant.nf +++ b/workflows/mhcquant.nf @@ -72,8 +72,6 @@ include { OPENMS_PEAKPICKERHIRES } from include { OPENMS_FILEFILTER } from '../modules/local/openms_filefilter' include { OPENMS_COMETADAPTER } from '../modules/local/openms_cometadapter' include { OPENMS_PEPTIDEINDEXER } from '../modules/local/openms_peptideindexer' -include { DEEPLC } from '../modules/local/deeplc' -include { MS2PIP } from '../modules/local/ms2pip' include { MS2RESCORE } from '../modules/local/ms2rescore' include { OPENMS_IDSCORESWITCHER } from '../modules/local/openms_idscoreswitcher' From 9f2c4d3d9ff5f6cc728f3882ba664be52fc93f79 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Thu, 30 Nov 2023 12:48:52 +0000 Subject: [PATCH 11/15] go back to actions/checkout@v2 --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bd5f86ab..7c6cb5ba 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -58,7 +58,7 @@ jobs: tests: ["test_mokapot", "test_percolator", "test_ionannotator"] steps: - name: Check out pipeline code - uses: actions/checkout@v3 + uses: actions/checkout@v2 - name: Install Nextflow env: NXF_VER: ${{ matrix.NXF_VER }} From cc295843cef3745ac36831ec16ec8621e9fca259 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Thu, 30 Nov 2023 13:00:54 +0000 Subject: [PATCH 12/15] correct indentation --- modules/local/ms2rescore.nf | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/local/ms2rescore.nf b/modules/local/ms2rescore.nf index 9ade4bf8..785cba97 100644 --- a/modules/local/ms2rescore.nf +++ b/modules/local/ms2rescore.nf @@ -25,11 +25,11 @@ process MS2RESCORE { """ ms2rescore_cli.py \\ - --psm_file $idxml \\ - --spectrum_path . \\ - --output_path ${prefix}.idXML \\ - --processes $task.cpus \\ - $args + --psm_file $idxml \\ + --spectrum_path . \\ + --output_path ${prefix}.idXML \\ + --processes $task.cpus \\ + $args cat <<-END_VERSIONS > versions.yml "${task.process}": From b0d2009ff727f903314facb103817a9d0fe11856 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Sat, 2 Dec 2023 17:54:00 +0000 Subject: [PATCH 13/15] rename cli argument in ms2rescore --- bin/ms2rescore_cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/ms2rescore_cli.py b/bin/ms2rescore_cli.py index a580ccf1..2cdc04d7 100755 --- a/bin/ms2rescore_cli.py +++ b/bin/ms2rescore_cli.py @@ -19,7 +19,7 @@ def parse_cli_arguments_to_config(**kwargs): for key, value in kwargs.items(): # Skip these arguments since they need to set in a nested dict of feature_generators - if key in ["ms2pip_model", "ms2_tolerance"]: + if key in ["ms2pip_model", "ms2_tolerance", "rng"]: continue elif key == "feature_generators": @@ -99,7 +99,7 @@ def rescore_idxml(input_file, output_file, config): help="Comma-separated list of feature generators to use (default: `ms2pip,deeplc`). See ms2rescore doc for further information", default="", ) -@click.option("-am", "--ms2pip_model", help="MS²PIP model (default: `Immuno-HCD`)", default="Immuno-HCD") +@click.option("-pipm", "--ms2pip_model", help="MS²PIP model (default: `Immuno-HCD`)", type=str, default="Immuno-HCD") @click.option( "-ms2tol", "--ms2_tolerance", help="Fragment mass tolerance [Da](default: `0.02`)", type=float, default=0.02 ) From ffdf5a103bee6e9c02c8fe6faec32a65bc17730f Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Wed, 6 Dec 2023 00:42:01 +0000 Subject: [PATCH 14/15] filter out not supported peptidehits --- bin/ms2rescore_cli.py | 45 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/bin/ms2rescore_cli.py b/bin/ms2rescore_cli.py index 2cdc04d7..be639b85 100755 --- a/bin/ms2rescore_cli.py +++ b/bin/ms2rescore_cli.py @@ -6,9 +6,14 @@ import importlib.resources import json import logging +from typing import List + +import pandas as pd from ms2rescore import rescore, package_data from psm_utils.io.idxml import IdXMLReader, IdXMLWriter +from psm_utils import PSMList +import pyopenms as oms logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") @@ -63,7 +68,7 @@ def parse_cli_arguments_to_config(**kwargs): return config -def rescore_idxml(input_file, output_file, config): +def rescore_idxml(input_file, output_file, config) -> None: """Rescore PSMs in an idXML file and keep other information unchanged.""" # Read PSMs reader = IdXMLReader(input_file) @@ -72,11 +77,47 @@ def rescore_idxml(input_file, output_file, config): # Rescore rescore(config, psm_list) + # Filter out PeptideHits within PeptideIdentification(s) that could not be processed by all feature generators + peptide_ids_filtered = filter_out_artifact_psms(psm_list, reader.peptide_ids) + # Write - writer = IdXMLWriter(output_file, reader.protein_ids, reader.peptide_ids) + writer = IdXMLWriter(output_file, reader.protein_ids, peptide_ids_filtered) writer.write_file(psm_list) +def filter_out_artifact_psms( + psm_list: PSMList, peptide_ids: List[oms.PeptideIdentification] +) -> List[oms.PeptideIdentification]: + """Filter out PeptideHits that could not be processed by all feature generators""" + num_mandatory_features = max([len(psm.rescoring_features) for psm in psm_list]) + new_psm_list = PSMList(psm_list=[psm for psm in psm_list if len(psm.rescoring_features) == num_mandatory_features]) + + # get differing peptidoforms of both psm lists + psm_list_peptides = set([next(iter(psm.provenance_data.items()))[1] for psm in psm_list]) + new_psm_list_peptides = set([next(iter(psm.provenance_data.items()))[1] for psm in new_psm_list]) + not_supported_peptides = psm_list_peptides - new_psm_list_peptides + + # no need to filter if all peptides are supported + if len(not_supported_peptides) == 0: + return peptide_ids + # Create new peptide ids and filter out not supported peptides + new_peptide_ids = [] + for peptide_id in peptide_ids: + new_hits = [] + for hit in peptide_id.getHits(): + if hit.getSequence().toString() in not_supported_peptides: + continue + new_hits.append(hit) + if len(new_hits) == 0: + continue + peptide_id.setHits(new_hits) + new_peptide_ids.append(peptide_id) + logging.info( + f"Removed {len(psm_list_peptides) - len(new_psm_list_peptides)} PSMs. Peptides not supported: {not_supported_peptides}" + ) + return new_peptide_ids + + @click.command() @click.option( "-p", "--psm_file", help="Path to PSM file (PIN, mzIdentML, MaxQuant msms, X!Tandem XML, idXML)", required=True From e9a752710916eb04cc0d7e04444f4f833831510e Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Thu, 7 Dec 2023 13:36:44 +0000 Subject: [PATCH 15/15] bump ms2rescore version to official release --- modules/local/ms2rescore.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/local/ms2rescore.nf b/modules/local/ms2rescore.nf index 785cba97..20588197 100644 --- a/modules/local/ms2rescore.nf +++ b/modules/local/ms2rescore.nf @@ -2,10 +2,10 @@ process MS2RESCORE { tag "$meta.id" label 'process_high' - conda "bioconda::ms2rescore=3.0.0b1" + conda "bioconda::ms2rescore=3.0.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/ms2rescore:3.0.0b1--pyhdfd78af_1': - 'biocontainers/ms2rescore:3.0.0b1--pyhdfd78af_1' }" + 'https://depot.galaxyproject.org/singularity/ms2rescore:3.0.0--pyhdfd78af_0': + 'biocontainers/ms2rescore:3.0.0--pyhdfd78af_0' }" input: tuple val(meta), path(idxml), path(mzml), path(fasta)