Skip to content

Commit

Permalink
lint
Browse files Browse the repository at this point in the history
  • Loading branch information
taylorreiter committed Feb 15, 2024
1 parent b876bdc commit 5ad5e33
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 39 deletions.
9 changes: 5 additions & 4 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@ from pathlib import Path
################################################################################

# Retrieves the absolute path of the directory snakemake is launched in.
# Used by DeepPeptide to simplify output file paths.
# Used by DeepPeptide to simplify output file paths.
WORKING_DIRPATH = Path(os.getcwd())


# Default pipeline configuration parameters are in the config file.
# If you create a new yml file and use the --configfile flag,
# If you create a new yml file and use the --configfile flag,
# options in that new file overwrite the defaults.
configfile: "./config.yml"

Expand Down Expand Up @@ -45,8 +46,8 @@ rule filter_nt_contigs_to_short:
"""


# TER TODO: Add a rule for sORF prediction, either once smallesm is developed,
# when there is an accurate sORF rnasamba model,
# TER TODO: Add a rule for sORF prediction, either once smallesm is developed,
# when there is an accurate sORF rnasamba model,
# or using another tool from Singh & Roy.


Expand Down
52 changes: 37 additions & 15 deletions scripts/extract_deeppeptide_sequences.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import argparse
import json
import sys

from Bio import SeqIO


def read_fasta(fasta_file):
"""Read a FASTA file using BioPython and return a dictionary of sequences."""
sequences = {}
Expand All @@ -11,21 +12,23 @@ def read_fasta(fasta_file):
return sequences


def extract_peptide_sequences(data, fasta_file, proteins_output_file, peptides_output_file):
def extract_peptide_sequences(
data, fasta_file, proteins_output_file, peptides_output_file
):
"""
Extract protein and peptide sequences based on the data dictionary and FASTA file,
then write to separate files.
Extracts protein and peptide sequences based on the provided data dictionary and a FASTA file.
The data object is created from a JSON file output by deeppeptide.
The protein sequences are extracted from the FASTA file using the IDs found in the data dictionary.
Peptide sequences are then extracted from these protein sequences based on start and end
positions specified for each peptide within the data dictionary.
The protein sequences are extracted from the FASTA file using IDs in the data dictionary.
Peptide sequences are then extracted from these protein sequences based on start and end
positions specified for each peptide within the data dictionary.
The extracted protein and peptide sequences are written to separate output files.
Parameters:
- data (dict): A dictionary containing prediction data, where each key is a protein ID and
the associated value is another dictionary with details including peptides' start and end
- data (dict): A dictionary containing prediction data, where each key is a protein ID and
the associated value is another dictionary with details including peptides' start and end
positions.
- fasta_file (str): The path to a FASTA file containing protein sequences.
This should be the same file used to make the DeepPeptide predictions.
Expand All @@ -51,7 +54,9 @@ def extract_peptide_sequences(data, fasta_file, proteins_output_file, peptides_o
"""
sequences = read_fasta(fasta_file)

with open(proteins_output_file, "w") as proteins_out, open(peptides_output_file, "w") as peptides_out:
with open(proteins_output_file, "w") as proteins_out, open(
peptides_output_file, "w"
) as peptides_out:
for protein_key, protein_info in data["PREDICTIONS"].items():
protein_id = protein_key.split()[0][1:] # Extract the ID part
peptides = protein_info.get("peptides")
Expand All @@ -73,19 +78,36 @@ def main(json_file, fasta_file, proteins_output_file, peptides_output_file):
with open(json_file) as f:
data = json.load(f)

extract_peptide_sequences(data, fasta_file, proteins_output_file, peptides_output_file)
extract_peptide_sequences(
data, fasta_file, proteins_output_file, peptides_output_file
)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Extract peptide sequences from DeepPeptide JSON.')
parser = argparse.ArgumentParser(
description="Extract peptide sequences from DeepPeptide JSON."
)

# Add the arguments
parser.add_argument('json_file', type=str, help='The JSON file output by DeepPeptide.')
parser.add_argument('fasta_file', type=str, help='The protein FASTA file input to DeepPeptide.')
parser.add_argument('proteins_output_file', type=str, help='The output file path for proteins.')
parser.add_argument('peptides_output_file', type=str, help='The output file path for peptides.')
parser.add_argument(
"json_file", type=str, help="The JSON file output by DeepPeptide."
)
parser.add_argument(
"fasta_file", type=str, help="The protein FASTA file input to DeepPeptide."
)
parser.add_argument(
"proteins_output_file", type=str, help="The output file path for proteins."
)
parser.add_argument(
"peptides_output_file", type=str, help="The output file path for peptides."
)

# Execute the parse_args() method
args = parser.parse_args()

main(args.json_file, args.fasta_file, args.proteins_output_file, args.peptides_output_file)
main(
args.json_file,
args.fasta_file,
args.proteins_output_file,
args.peptides_output_file,
)
53 changes: 33 additions & 20 deletions scripts/run_nlpprecursor.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import csv
import sys
import time
from pathlib import Path
Expand Down Expand Up @@ -71,33 +72,45 @@ def main(models_dir, multifasta_file, output_tsv):

# The output of nlpprecursor predictions is in JSON format.
# The code below parses the JSON into a TSV format.
with open(output_tsv, 'w', newline='\n') as file:
writer = csv.writer(file, delimiter='\t')

writer.writerow([
'name', 'class', 'class_score', 'cleavage_sequence', 'cleavage_start', 'cleavage_stop', 'cleavage_score'
])

for ind, sequence in enumerate(sequences):
name = sequence['name']
class_pred = class_predictions[ind]['class_predictions'][0]
cleavage_pred = cleavage_predictions[ind]['cleavage_prediction']
with open(output_tsv, "w", newline="\n") as file:
writer = csv.writer(file, delimiter="\t")

writer.writerow(
[
"name",
"class",
"class_score",
"cleavage_sequence",
"cleavage_start",
"cleavage_stop",
"cleavage_score",
]
)

writer.writerow([
name,
class_pred['class'],
class_pred['score'],
cleavage_pred['sequence'],
cleavage_pred['start'],
cleavage_pred['stop'],
cleavage_pred['score']
])
for ind, sequence in enumerate(sequences):
name = sequence["name"]
class_pred = class_predictions[ind]["class_predictions"][0]
cleavage_pred = cleavage_predictions[ind]["cleavage_prediction"]

writer.writerow(
[
name,
class_pred["class"],
class_pred["score"],
cleavage_pred["sequence"],
cleavage_pred["start"],
cleavage_pred["stop"],
cleavage_pred["score"],
]
)


if __name__ == "__main__":
if len(sys.argv) != 4:
print("Usage: python run_nlpprecursor.py <models_dir> <multifasta_file> <output_tsv>")
print(
"Usage: python run_nlpprecursor.py <models_dir> <multifasta_file> <output_tsv>"
)
sys.exit(1)

models_dir = sys.argv[1]
Expand Down

0 comments on commit 5ad5e33

Please sign in to comment.