-
Notifications
You must be signed in to change notification settings - Fork 0
/
predictStruct.py
146 lines (135 loc) · 5.05 KB
/
predictStruct.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# -*- coding: utf-8 -*-
"""
Created on Thu May 10 17:59:25 2018
@author: rmcp1g15
"""
import numpy as np
import random
dictd = []
class Structure(object):
def __init__(self, name, sequence, secondary):
self.name = name
self.sequence = sequence
self.secondary = secondary
aminoAcids = "ARNDCQEGHILKMFPSTWYVBZ"
labels = "HEC"
def getRepresentation(self, windowsSize):
half_size = int( (windowsSize - 1)/2 )
windowSequence = (("."*half_size) + self.sequence +("."*half_size))
X = []
Y = []
#shuffle numbers i to get random windowing
for i in random.sample(range(half_size,
len(self.sequence) + half_size), len(self.sequence)):
#matrix of 22*windowSize
Xi = np.zeros( (windowsSize, len(self.aminoAcids)), dtype=np.bool_)
current_seq = windowSequence[i-half_size: i+half_size+1]
for j in range(windowsSize):
if current_seq[j] != ".":
#mark true for the amino acid present in current window
ind = self.aminoAcids.index(current_seq[j])
Xi[j,ind] = True
X.append(Xi.flatten())
Yi = np.zeros(len(self.labels), dtype=np.bool_)
label_ind = self.labels.index(self.secondary[i-half_size])
#belonging class
Yi[label_ind] = True
Y.append(Yi.flatten())
X = np.array(X)
Y = np.array(Y)
return X, Y
def getStructs():
with open('ss.txt', "r") as f:
id = []
toAdd = ''
currentLine = f.readline()
id.append(currentLine.rstrip().split(':sequence')[0])
while len(id) < 782163:
currentLine = f.readline()
if currentLine.startswith(">"):
if toAdd != '':
id.append(toAdd)
toAdd = ''
if currentLine.rstrip().split(':')[2] == 'sequence' and len(id) + 1 < 782163:
id.append(currentLine.rstrip().split(':sequence')[0])
else:
toAdd += currentLine.rstrip('\n')
allStructs = []
with open('all.txt', "w") as f:
print(len(id))
addedToFile = 0
for i in range(0, len(id), 3):
if addedToFile < 150 and len(id[i+1]) > 20:
f.write(id[i]+"\n")
f.write(id[i+1]+"\n")
addedToFile += 1
second = (id[i+2]).replace("G", "H").replace("I","H").replace(" ", "C")
second = second.replace("X","C")
if "X" not in id[i+1]:
if " " not in id[i+1]:
if "U" not in id[i+1] and len(id[i+1]) > 20:
newStruct = Structure(id[i], id[i+1],
second.replace("B", "E").replace("T", "C").replace("S", "C"))
dictd.append(id[i+1])
dictd.append(second.replace("B", "E").replace("T", "C").replace("S", "C"))
allStructs.append(newStruct)
return allStructs
import os
import re
def cleanhtml(raw_html):
cleanr = re.compile('<.*?>')
cleantext = re.sub(cleanr, '', raw_html).strip()
return cleantext
def getTotalMismatches(seq1, seq2):
total = 0
if len(seq1) != len(seq2):
print('somethings wrong')
else:
for i in range(len(seq1)):
if seq1[i] != seq2[i]:
total += 1
return total
def getStructure(sequence, dictionary):
found = None
for s in dictionary:
if s.sequence == sequence:
print(found)
found = s
return found
def getResultsAccuracy(results):
dictionary = getStructs()
totalNum = 0
totalMismatches = 0
for struct in results:
foundStruct,=[x for x in dictionary if x.sequence == struct.sequence.strip()]
totalMismatches += getTotalMismatches(foundStruct.secondary, struct.secondary)
totalNum += len(struct.secondary)
return 1-(totalMismatches/totalNum)
allResults = []
other = []
rootdir = 'C:/Users/suki_/Documents/SecondaryStructure/results'
for subdir, dirs, files in os.walk(rootdir):
for file in files:
if file.endswith(".simple.html"):
path = os.path.join(subdir, file)
with open(path, "r") as f:
di = list(f)
name = path
seq = cleanhtml(di[5]).strip()
secondary = cleanhtml(di[6]).strip().replace('-','C')
newStruct = Structure(name, seq, secondary)
other.append(seq)
other.append(secondary)
allResults.append(newStruct)
dictionary = getStructs()
totalNum = 0
totalMismatches = 0
for struct in allResults:
foundStruct = getStructure(struct.sequence.strip(), dictionary)
if foundStruct != None:
print(True)
totalMismatches += getTotalMismatches(foundStruct.secondary, struct.secondary)
totalNum += len(struct.secondary)
accuracy = 1-(totalMismatches/totalNum)
print("Sample JPred Accuracy: ")
print(accuracy)