-
Notifications
You must be signed in to change notification settings - Fork 0
/
annotate_REs.py
124 lines (93 loc) · 3.83 KB
/
annotate_REs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import sys
import os
import argparse
from multiprocessing import Pool
import pandas as pd
from tqdm import tqdm
import yaml
import json
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA
from Bio.Restriction import RestrictionBatch, Analysis
from Bio.Restriction import AllEnzymes, CommOnly
from jakomics import utilities, colors, file
from jakomics.genome import GENOME
import jak_utils
print(f"{colors.bcolors.RED}This code is under development and probably not finished!{colors.bcolors.END}"))
# OPTIONS #####################################################################
parser=argparse.ArgumentParser(description = "", formatter_class = argparse.RawTextHelpFormatter)
parser.add_argument('--in_dir',
help = "Directory with faa files",
required = False,
default = "")
parser.add_argument('-f', '--files',
help = "Paths to individual faa files",
nargs = '*',
required = False,
default = [])
parser.add_argument('--out_dir',
help = "Directory to write results to",
required = True)
args=parser.parse_args()
# FUNCTIONS ###################################################################
def main(genome):
original_path=genome.file_path
if genome.suffix in ['.gb', '.gbk', '.gbff']:
gbk=GENOME(genome)
gbk.fa_path=genome.short_name + "_" + genome.id + ".fa"
gbk.genbank_to_fasta(write_contig = gbk.fa_path)
genome.file_path=gbk.fa_path
genome.temp_files['fa']=gbk.fa_path
output_file=os.path.join(args.out_dir, genome.name + '.REs.json')
if os.path.exists(output_file):
os.remove(output_file)
for seq_record in SeqIO.parse(genome.file_path, "fasta"):
a=cleanEnzymeList.search(seq_record.seq)
# print(a)
#
for b in a:
print(type(b))
# print(b, len(a[b]), sep="\t")
# a = Analysis(RestrictionBatch(cleanEnzymeList), seq_record.seq, False)
# print(type(a))
with open(output_file, 'w') as outfile:
json.dump(a, outfile, indent = 2)
#
# # write to file with comments
# f = open(output_file, 'a')
# for c in jak_utils.header(r=True):
# print(f'# {c}', file=f)
# print(f'# FILE: {original_path}', file=f)
# print(f'# DB: {jak_utils.get_yaml("kofam_db")}', file=f)
# print(f'# PROFILE: {args.profile}', file=f)
# print(f'# SCALE: {args.t_scale}', file=f)
#
# df.to_csv(f, sep="\t", index=False)
genome.remove_temp()
# MAIN LOOP ###################################################################
if __name__ == "__main__":
jak_utils.header()
# fix out directory
args.out_dir=os.path.abspath(args.out_dir) + '/'
if not os.path.exists(args.out_dir):
print("\nCreating directory " + args.out_dir)
os.makedirs(args.out_dir)
genome_list=utilities.get_files(args.files, args.in_dir, ["faa", "gbk", "gbff", "gb"])
if len(genome_list) == 0:
sys.exit(
f"{colors.bcolors.RED}Error: No valid files were found!{colors.bcolors.END}")
rawEnzymeList = RestrictionBatch(first=[], suppliers=['N'])
# rawEnzymeList = RestrictionBatch(first=[], suppliers=['B', 'N', 'K', 'R'])
print(len(rawEnzymeList))
cleanEnzymeListNames = []
for enzyme in rawEnzymeList:
if enzyme.size > 5: # recognition site at least 4 bases
if enzyme.cut_twice() == False:
cleanEnzymeListNames.append(enzyme)
cleanEnzymeList = RestrictionBatch(cleanEnzymeListNames)
print(len(cleanEnzymeList))
pool = Pool()
for _ in tqdm(pool.imap_unordered(main, genome_list), total=len(genome_list), desc="Finished", unit=" genomes"):
pass
pool.close()