Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add rules to characterize peptide sequences #7

Merged
merged 11 commits into from
Feb 22, 2024
38 changes: 37 additions & 1 deletion Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -339,7 +339,7 @@ rule combine_peptide_predictions:


################################################################################
## Charaterize & annotate predicted peptide sequences
## Compare against known peptides
################################################################################


Expand Down Expand Up @@ -390,6 +390,40 @@ rule diamond_blastp_peptide_predictions_against_peptipedia_database:
"""


################################################################################
## Characterize & annotate predicted peptide sequences
################################################################################


rule run_deepsig:
    """
    This rule uses deepsig to predict signal peptides in proteins using deep learning.
    It runs on the combined peptide predictions and writes a single TSV of results.
    The `-k euk` flag is passed to deepsig to select the organism group
    (presumably eukaryotes -- confirm against the deepsig documentation).
    """
    input:
        peptide=rules.combine_peptide_predictions.output.peptide,
    output:
        tsv=OUTPUT_DIR / "annotation/deepsig/deepsig.tsv",
    conda:
        "envs/deepsig.yml"
    shell:
        """
        deepsig -f {input.peptide} -o {output.tsv} -k euk
        """


rule characterize_peptides:
    """
    This rule runs scripts/characterize_peptides.py on the combined peptide predictions.
    The script uses the `peptides` library to calculate physicochemical properties and
    descriptors for each peptide and writes one row per peptide to a TSV file.
    """
    input:
        peptide=rules.combine_peptide_predictions.output.peptide,
    output:
        tsv=OUTPUT_DIR / "annotation/characteristics/peptide_characteristics.tsv",
    conda:
        "envs/peptides.yml"
    shell:
        """
        python scripts/characterize_peptides.py {input.peptide} {output.tsv}
        """


################################################################################
## Target rule all
################################################################################
Expand Down Expand Up @@ -422,6 +456,8 @@ rule predict_cleavage:
rules.nlpprecursor.output.peptide,
rules.extract_deeppeptide_sequences.output.peptide,
rules.diamond_blastp_peptide_predictions_against_peptipedia_database.output.tsv,
rules.run_deepsig.output.tsv,
rules.characterize_peptides.output.tsv,


rule predict_nrps:
Expand Down
11 changes: 11 additions & 0 deletions envs/deepsig.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Conda environment used by the run_deepsig rule; provides the deepsig command-line tool.
# Package versions and source channels are pinned explicitly.
channels:
  - anaconda
  - conda-forge
  - bioconda
  - defaults
dependencies:
  - anaconda::python=3.8.16
  - anaconda::numpy=1.23.5
  - anaconda::biopython=1.78
  - anaconda::tensorflow-gpu=2.2.0
  - bioconda::deepsig=1.2.5
7 changes: 7 additions & 0 deletions envs/peptides.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Conda environment used by the characterize_peptides rule.
# Provides the two third-party packages imported by scripts/characterize_peptides.py:
# biopython (Bio.SeqIO, for FASTA parsing) and peptides (for descriptor calculations).
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- biopython=1.83
- peptides=0.3.1
93 changes: 93 additions & 0 deletions scripts/characterize_peptides.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import argparse
import csv

import peptides
from Bio import SeqIO


def characterize_peptides(input_file, output_file):
    """
    Calculate physicochemical properties for each peptide in a FASTA file and write them to a TSV.

    This function uses the `peptides` library to calculate peptide properties like aliphatic index,
    boman index, charge, hydrophobicity, instability index, isoelectric point, molecular weight,
    the first two physical descriptors, and the first five z-scales.
    It uses the default arguments for all peptide measurements as defined in the `peptides` library.
    For a comprehensive list of available measurements and their optional arguments, refer to the
    `peptides` library documentation: https://peptides.readthedocs.io.

    Parameters:
    - input_file (str): Path to the input FASTA file containing amino acid sequences of peptides.
    - output_file (str): Path to the output TSV file where the peptide properties will be written.

    The output starts with a header row. Each following row holds the peptide ID (the FASTA
    record ID) and the calculated properties, in the same order as the header. The peptide
    sequence itself is not written to the output.

    Note: This function writes directly to the output file and does not return any value.
    """
    header = [
        "id",
        "aliphatic_index",
        "boman_index",
        "charge",
        "hydrophobicity",
        "instability_index",
        "isoelectric_point",
        "molecular_weight",
        "pd1_residue_volume",
        "pd2_hydrophilicity",
        "z1_lipophilicity",
        "z2_steric_bulk_or_polarizability",
        "z3_polarity_or_charge",
        "z4_electronegativity_etc",
        "z5_electronegativity_etc",
    ]

    # newline="" lets the csv module control line endings itself.
    with open(output_file, "w", newline="") as out_file:
        tsv_writer = csv.writer(out_file, delimiter="\t")
        tsv_writer.writerow(header)

        # Records are streamed one at a time, so the whole FASTA is never held in memory.
        for record in SeqIO.parse(input_file, "fasta"):
            peptide_sequence = peptides.Peptide(str(record.seq))
            physical_descriptors = peptide_sequence.physical_descriptors()
            zscales = peptide_sequence.z_scales()
            tsv_writer.writerow(
                [
                    record.id,
                    peptide_sequence.aliphatic_index(),
                    peptide_sequence.boman(),
                    peptide_sequence.charge(),
                    peptide_sequence.hydrophobicity(),
                    peptide_sequence.instability_index(),
                    peptide_sequence.isoelectric_point(),
                    peptide_sequence.molecular_weight(),
                    physical_descriptors[0],
                    physical_descriptors[1],
                    zscales[0],
                    zscales[1],
                    zscales[2],
                    zscales[3],
                    zscales[4],
                ]
            )


def main(argv=None):
    """
    Parse command-line arguments and characterize the peptides in the given FASTA file.

    Parameters:
    - argv (list of str, optional): Arguments to parse instead of the process command line.
      When None (the default), argparse reads the arguments from sys.argv.

    Two positional arguments are required: the input multi-FASTA file and the output TSV path.
    If either is missing, argparse prints a usage message and exits.
    """
    parser = argparse.ArgumentParser(description="Characterize peptides from a multi-fasta file.")
    parser.add_argument("input_file", type=str, help="Input multi-fasta file of amino acids")
    parser.add_argument("output_file", type=str, help="Output TSV file to write the results")

    args = parser.parse_args(argv)

    characterize_peptides(args.input_file, args.output_file)


if __name__ == "__main__":
    main()
Loading