lint

Arcadia-Science · Feb 15, 2024 · 5ad5e33 · 5ad5e33
1 parent b876bdc
commit 5ad5e33
Show file tree

Hide file tree

Showing 3 changed files with 75 additions and 39 deletions.
diff --git a/Snakefile b/Snakefile
@@ -6,11 +6,12 @@ from pathlib import Path
 ################################################################################
 
 # Retrieves the absolute path of the directory snakemake is launched in.
-# Used by DeepPeptide to simplify output file paths. 
+# Used by DeepPeptide to simplify output file paths.
 WORKING_DIRPATH = Path(os.getcwd())
 
+
 # Default pipeline configuration parameters are in the config file.
-# If you create a new yml file and use the --configfile flag, 
+# If you create a new yml file and use the --configfile flag,
 # options in that new file overwrite the defaults.
 configfile: "./config.yml"
 
@@ -45,8 +46,8 @@ rule filter_nt_contigs_to_short:
         """
 
 
-# TER TODO: Add a rule for sORF prediction, either once smallesm is developed, 
-#           when there is an accurate sORF rnasamba model, 
+# TER TODO: Add a rule for sORF prediction, either once smallesm is developed,
+#           when there is an accurate sORF rnasamba model,
 #           or using another tool from Singh & Roy.
 
 

diff --git a/scripts/extract_deeppeptide_sequences.py b/scripts/extract_deeppeptide_sequences.py
@@ -1,8 +1,9 @@
 import argparse
 import json
-import sys
+
 from Bio import SeqIO
 
+
 def read_fasta(fasta_file):
     """Read a FASTA file using BioPython and return a dictionary of sequences."""
     sequences = {}
@@ -11,21 +12,23 @@ def read_fasta(fasta_file):
     return sequences
 
 
-def extract_peptide_sequences(data, fasta_file, proteins_output_file, peptides_output_file):
+def extract_peptide_sequences(
+    data, fasta_file, proteins_output_file, peptides_output_file
+):
     """
     Extract protein and peptide sequences based on the data dictionary and FASTA file,
     then write to separate files.
 
     Extracts protein and peptide sequences based on the provided data dictionary and a FASTA file.
     The data object is created from a JSON file output by deeppeptide.
-    The protein sequences are extracted from the FASTA file using the IDs found in the data dictionary.
-    Peptide sequences are then extracted from these protein sequences based on start and end 
-    positions specified for each peptide within the data dictionary. 
+    The protein sequences are extracted from the FASTA file using IDs in the data dictionary.
+    Peptide sequences are then extracted from these protein sequences based on start and end
+    positions specified for each peptide within the data dictionary.
     The extracted protein and peptide sequences are written to separate output files.
 
     Parameters:
-    - data (dict): A dictionary containing prediction data, where each key is a protein ID and 
-      the associated value is another dictionary with details including peptides' start and end 
+    - data (dict): A dictionary containing prediction data, where each key is a protein ID and
+      the associated value is another dictionary with details including peptides' start and end
       positions.
     - fasta_file (str): The path to a FASTA file containing protein sequences.
       This should be the same file used to make the DeepPeptide predictions.
@@ -51,7 +54,9 @@ def extract_peptide_sequences(data, fasta_file, proteins_output_file, peptides_o
     """
     sequences = read_fasta(fasta_file)
 
-    with open(proteins_output_file, "w") as proteins_out, open(peptides_output_file, "w") as peptides_out:
+    with open(proteins_output_file, "w") as proteins_out, open(
+        peptides_output_file, "w"
+    ) as peptides_out:
         for protein_key, protein_info in data["PREDICTIONS"].items():
             protein_id = protein_key.split()[0][1:]  # Extract the ID part
             peptides = protein_info.get("peptides")
@@ -73,19 +78,36 @@ def main(json_file, fasta_file, proteins_output_file, peptides_output_file):
     with open(json_file) as f:
         data = json.load(f)
 
-    extract_peptide_sequences(data, fasta_file, proteins_output_file, peptides_output_file)
+    extract_peptide_sequences(
+        data, fasta_file, proteins_output_file, peptides_output_file
+    )
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description='Extract peptide sequences from DeepPeptide JSON.')
+    parser = argparse.ArgumentParser(
+        description="Extract peptide sequences from DeepPeptide JSON."
+    )
 
     # Add the arguments
-    parser.add_argument('json_file', type=str, help='The JSON file output by DeepPeptide.')
-    parser.add_argument('fasta_file', type=str, help='The protein FASTA file input to DeepPeptide.')
-    parser.add_argument('proteins_output_file', type=str, help='The output file path for proteins.')
-    parser.add_argument('peptides_output_file', type=str, help='The output file path for peptides.')
+    parser.add_argument(
+        "json_file", type=str, help="The JSON file output by DeepPeptide."
+    )
+    parser.add_argument(
+        "fasta_file", type=str, help="The protein FASTA file input to DeepPeptide."
+    )
+    parser.add_argument(
+        "proteins_output_file", type=str, help="The output file path for proteins."
+    )
+    parser.add_argument(
+        "peptides_output_file", type=str, help="The output file path for peptides."
+    )
 
     # Execute the parse_args() method
     args = parser.parse_args()
 
-    main(args.json_file, args.fasta_file, args.proteins_output_file, args.peptides_output_file)
+    main(
+        args.json_file,
+        args.fasta_file,
+        args.proteins_output_file,
+        args.peptides_output_file,
+    )
diff --git a/scripts/run_nlpprecursor.py b/scripts/run_nlpprecursor.py
@@ -1,3 +1,4 @@
+import csv
 import sys
 import time
 from pathlib import Path
@@ -71,33 +72,45 @@ def main(models_dir, multifasta_file, output_tsv):
 
     # The output of nlpprecursor predictions is in JSON format.
     # The code below parses the JSON into a TSV format.
-with open(output_tsv, 'w', newline='\n') as file:
-    writer = csv.writer(file, delimiter='\t')
 
-    writer.writerow([
-        'name', 'class', 'class_score', 'cleavage_sequence', 'cleavage_start', 'cleavage_stop', 'cleavage_score'
-    ])
-
-    for ind, sequence in enumerate(sequences):
-        name = sequence['name']
-        class_pred = class_predictions[ind]['class_predictions'][0]
-        cleavage_pred = cleavage_predictions[ind]['cleavage_prediction']
+    with open(output_tsv, "w", newline="\n") as file:
+        writer = csv.writer(file, delimiter="\t")
+
+        writer.writerow(
+            [
+                "name",
+                "class",
+                "class_score",
+                "cleavage_sequence",
+                "cleavage_start",
+                "cleavage_stop",
+                "cleavage_score",
+            ]
+        )
 
-        writer.writerow([
-            name,
-            class_pred['class'],
-            class_pred['score'],
-            cleavage_pred['sequence'],
-            cleavage_pred['start'],
-            cleavage_pred['stop'],
-            cleavage_pred['score']
-        ])
+        for ind, sequence in enumerate(sequences):
+            name = sequence["name"]
+            class_pred = class_predictions[ind]["class_predictions"][0]
+            cleavage_pred = cleavage_predictions[ind]["cleavage_prediction"]
 
+            writer.writerow(
+                [
+                    name,
+                    class_pred["class"],
+                    class_pred["score"],
+                    cleavage_pred["sequence"],
+                    cleavage_pred["start"],
+                    cleavage_pred["stop"],
+                    cleavage_pred["score"],
+                ]
+            )
 
 
 if __name__ == "__main__":
     if len(sys.argv) != 4:
-        print("Usage: python run_nlpprecursor.py <models_dir> <multifasta_file> <output_tsv>")
+        print(
+            "Usage: python run_nlpprecursor.py <models_dir> <multifasta_file> <output_tsv>"
+        )
         sys.exit(1)
 
     models_dir = sys.argv[1]