-
Notifications
You must be signed in to change notification settings - Fork 0
/
repeatseq.R
59 lines (53 loc) · 1.59 KB
/
repeatseq.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# repeatqeq is a function that screens DNA sequences for interspersed repeats
# and low complexity DNA sequences. The output of the program is a detailed annotation
# of the repeats that are present in the query sequences as well as the number of
# duplicates in the query sequences.
repeatseq = function(str,len=c(1,2,3,4,5)){
str=as.character(str)
lista=strsplit(str,"")
tot=length(lista)
n_len=length(len)
dupl_matrix=matrix(0,nrow=tot,ncol=n_len)
for(h in 1:n_len){
for(j in 1:tot){
gene=lista[[j]]
n=length(gene)-len[h]+1
dupl=NULL
for(i in 1:n){
a=i
a1=a+len[h]-1
primer=gene[a:a1]
dupl[i]=0
while(a<=n & a>0){
if(all(gene[a:a1]==primer)){
dupl[i]=dupl[i]+1
a=a+len[h]
a1=a1+len[h]
}else{
a=-1
}
}
}
dupl_matrix[j,h]=max(dupl)
}
}
colnames(dupl_matrix)=as.vector(len)
rownames(dupl_matrix)=str
t=table(str)
duplicated=t[str]
dupl_matrix=cbind(dupl_matrix,duplicated)
dupl_matrix
}
str=c("CGCGCGCGCGCGCGCGCGCGCGATGTACTT",
"TGACCTCTGGAATCGAGGTCAG",
"TGTTGTTGTTGTTGTTGTTGTTGTTGTTGT",
"TGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA",
"CCTTCCTTCCTTCCTTCCTTCCTTCCTT",
"ACGTTGACGGACCTAACTGACTGCCGTGACCTTAC",
"ACGTAACGTAACGTAACGTAACGTAACGTAACGTAACGTAACGTAACGTA",
"AAATTTAAATTTAAATTTAAATTTAAATTTAAATTTAAATTTAAATTT",
"TGAACCTATTAGGCCAATTTTAACCCGTACCAATGCCGGGGT",
"TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT",
"AAGTGCGCGGATCAC",
"AAGTGCGCGGATCAC")
repeatseq(str)