From 5ad5e332cbe160efe64b90a3a2d4d0f8588c1223 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Thu, 15 Feb 2024 12:34:15 -0500 Subject: [PATCH] lint --- Snakefile | 9 ++-- scripts/extract_deeppeptide_sequences.py | 52 ++++++++++++++++------- scripts/run_nlpprecursor.py | 53 +++++++++++++++--------- 3 files changed, 75 insertions(+), 39 deletions(-) diff --git a/Snakefile b/Snakefile index aca8365..7c9f01c 100644 --- a/Snakefile +++ b/Snakefile @@ -6,11 +6,12 @@ from pathlib import Path ################################################################################ # Retrieves the absolute path of the directory snakemake is launched in. -# Used by DeepPeptide to simplify output file paths. +# Used by DeepPeptide to simplify output file paths. WORKING_DIRPATH = Path(os.getcwd()) + # Default pipeline configuration parameters are in the config file. -# If you create a new yml file and use the --configfile flag, +# If you create a new yml file and use the --configfile flag, # options in that new file overwrite the defaults. configfile: "./config.yml" @@ -45,8 +46,8 @@ rule filter_nt_contigs_to_short: """ -# TER TODO: Add a rule for sORF prediction, either once smallesm is developed, -# when there is an accurate sORF rnasamba model, +# TER TODO: Add a rule for sORF prediction, either once smallesm is developed, +# when there is an accurate sORF rnasamba model, # or using another tool from Singh & Roy. 
diff --git a/scripts/extract_deeppeptide_sequences.py b/scripts/extract_deeppeptide_sequences.py index 45713bd..976cd63 100644 --- a/scripts/extract_deeppeptide_sequences.py +++ b/scripts/extract_deeppeptide_sequences.py @@ -1,8 +1,9 @@ import argparse import json -import sys + from Bio import SeqIO + def read_fasta(fasta_file): """Read a FASTA file using BioPython and return a dictionary of sequences.""" sequences = {} @@ -11,21 +12,23 @@ def read_fasta(fasta_file): return sequences -def extract_peptide_sequences(data, fasta_file, proteins_output_file, peptides_output_file): +def extract_peptide_sequences( + data, fasta_file, proteins_output_file, peptides_output_file +): """ Extract gene and peptide sequences based on the data dictionary and FASTA file, then write to separate files. Extracts protein and peptide sequences based on the provided data dictionary and a FASTA file. The data object is created from a JSON file output by deeppeptide. - The protein sequences are extracted from the FASTA file using the IDs found in the data dictionary. - Peptide sequences are then extracted from these protein sequences based on start and end - positions specified for each peptide within the data dictionary. + The protein sequences are extracted from the FASTA file using IDs in the data dictionary. + Peptide sequences are then extracted from these protein sequences based on start and end + positions specified for each peptide within the data dictionary. The extracted protein and peptide sequences are written to separate output files. Parameters: - - data (dict): A dictionary containing prediction data, where each key is a protein ID and - the associated value is another dictionary with details including peptides' start and end + - data (dict): A dictionary containing prediction data, where each key is a protein ID and + the associated value is another dictionary with details including peptides' start and end positions. 
- fasta_file (str): The path to a FASTA file containing protein sequences. This should be the same file used to make the DeepPeptide predictions. @@ -51,7 +54,9 @@ def extract_peptide_sequences(data, fasta_file, proteins_output_file, peptides_o """ sequences = read_fasta(fasta_file) - with open(proteins_output_file, "w") as proteins_out, open(peptides_output_file, "w") as peptides_out: + with open(proteins_output_file, "w") as proteins_out, open( + peptides_output_file, "w" + ) as peptides_out: for protein_key, protein_info in data["PREDICTIONS"].items(): protein_id = protein_key.split()[0][1:] # Extract the ID part peptides = protein_info.get("peptides") @@ -73,19 +78,36 @@ def main(json_file, fasta_file, proteins_output_file, peptides_output_file): with open(json_file) as f: data = json.load(f) - extract_peptide_sequences(data, fasta_file, proteins_output_file, peptides_output_file) + extract_peptide_sequences( + data, fasta_file, proteins_output_file, peptides_output_file + ) if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Extract peptide sequences from DeepPeptide JSON.') + parser = argparse.ArgumentParser( + description="Extract peptide sequences from DeepPeptide JSON." + ) # Add the arguments - parser.add_argument('json_file', type=str, help='The JSON file output by DeepPeptide.') - parser.add_argument('fasta_file', type=str, help='The protein FASTA file input to DeepPeptide.') - parser.add_argument('proteins_output_file', type=str, help='The output file path for proteins.') - parser.add_argument('peptides_output_file', type=str, help='The output file path for peptides.') + parser.add_argument( + "json_file", type=str, help="The JSON file output by DeepPeptide." + ) + parser.add_argument( + "fasta_file", type=str, help="The protein FASTA file input to DeepPeptide." + ) + parser.add_argument( + "proteins_output_file", type=str, help="The output file path for proteins." 
+ ) + parser.add_argument( + "peptides_output_file", type=str, help="The output file path for peptides." + ) # Execute the parse_args() method args = parser.parse_args() - main(args.json_file, args.fasta_file, args.proteins_output_file, args.peptides_output_file) + main( + args.json_file, + args.fasta_file, + args.proteins_output_file, + args.peptides_output_file, + ) diff --git a/scripts/run_nlpprecursor.py b/scripts/run_nlpprecursor.py index b15bb8d..d667b23 100644 --- a/scripts/run_nlpprecursor.py +++ b/scripts/run_nlpprecursor.py @@ -1,3 +1,4 @@ +import csv import sys import time from pathlib import Path @@ -71,33 +72,45 @@ def main(models_dir, multifasta_file, output_tsv): # The output of nlpprecursor predictions are in JSON format. # The code below parses the JSON into a TSV format. -with open(output_tsv, 'w', newline='\n') as file: - writer = csv.writer(file, delimiter='\t') - writer.writerow([ - 'name', 'class', 'class_score', 'cleavage_sequence', 'cleavage_start', 'cleavage_stop', 'cleavage_score' - ]) - - for ind, sequence in enumerate(sequences): - name = sequence['name'] - class_pred = class_predictions[ind]['class_predictions'][0] - cleavage_pred = cleavage_predictions[ind]['cleavage_prediction'] + with open(output_tsv, "w", newline="\n") as file: + writer = csv.writer(file, delimiter="\t") + + writer.writerow( + [ + "name", + "class", + "class_score", + "cleavage_sequence", + "cleavage_start", + "cleavage_stop", + "cleavage_score", + ] + ) - writer.writerow([ - name, - class_pred['class'], - class_pred['score'], - cleavage_pred['sequence'], - cleavage_pred['start'], - cleavage_pred['stop'], - cleavage_pred['score'] - ]) + for ind, sequence in enumerate(sequences): + name = sequence["name"] + class_pred = class_predictions[ind]["class_predictions"][0] + cleavage_pred = cleavage_predictions[ind]["cleavage_prediction"] + writer.writerow( + [ + name, + class_pred["class"], + class_pred["score"], + cleavage_pred["sequence"], + cleavage_pred["start"], + 
cleavage_pred["stop"], + cleavage_pred["score"], + ] + ) if __name__ == "__main__": if len(sys.argv) != 4: - print("Usage: python run_nlpprecursor.py <models_dir> <multifasta_file> <output_tsv>") + print( + "Usage: python run_nlpprecursor.py <models_dir> <multifasta_file> <output_tsv>" + ) sys.exit(1) models_dir = sys.argv[1]