forked from andor-halab1/FISHerMan
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ncrnaParse.m
102 lines (87 loc) · 2.75 KB
/
ncrnaParse.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
function [Header,Sequence]=ncrnaParse(ncrna,seqData,trna,params)
% params = struct('species','Mouse','verbose',1,...
% 'dir1','C:\FISHerMan\Db\Mouse38.ncrna.fa',...
% 'tRNA',1,'dirT','C:\FISHerMan\Db\Mouse.trna.txt',...
% 'keys',{'ncrna','gene:\S*','gene_biotype:\S*'});
if params(1).verbose
disp('reading the ncrna data file');
end
[Header, Sequence] = fastaread(ncrna);
Header = Header';
Sequence = Sequence';
if params(1).verbose
disp(' trimming fasta headers');
end
for i = 1:length(params)
[pos1{i,1}, pos2{i,1}] = regexp(Header, params(i).keys, 'start', 'end');
end
for n = 1:length(Header)
temp = Header{n,1};
temp1 = temp(1:pos1{1,1}{n,1}-2);
temp2 = temp(pos1{2,1}{n,1}+5:pos2{2,1}{n,1});
temp3 = temp(pos1{3,1}{n,1}+13:pos2{3,1}{n,1});
if isempty(temp1)
disp('missing transcript ID');
quit;
elseif strfind(temp1,'.')
temp1pos=strfind(temp1,'.');
temp1=temp1(1:temp1pos(1)-1);
end
if isempty(temp2)
disp('missing gene ID');
quit;
elseif strfind(temp2,'.')
temp2pos=strfind(temp2,'.');
temp2=temp2(1:temp2pos(1)-1);
end
if isempty(temp3)
disp('missing gene type');
quit;
elseif strfind(temp3,'.')
temp3pos=strfind(temp3,'.');
temp3=temp3(1:temp3pos(1)-1);
end
Header{n,1} = strcat(temp1, ':', temp2, ':', temp3);
end
if ~isempty(seqData)
if params(1).verbose
disp(' picking expressed sequences according to RNA-seq data');
end
[Header, Sequence] = pickExpressedSeq(seqData, Header, Sequence);
end
if params(1).tRNA~=0
if params(1).verbose
disp(' appending additional tRNA sequences to ncrna data file');
end
[trnaHeader, trnaSequence] = fastaread(trna);
trnaHeader = trnaHeader';
trnaSequence = trnaSequence';
for n = 1:length(trnaHeader)
Header{end+1,1}=['ENSRNAT' num2str(n) ':ENSRNAG' num2str(n) ':tRNA'];
Sequence{end+1,1}=trnaSequence{n,1};
end
end
if params(1).verbose
disp('saving the ncrna fasta file');
end
ncrna = [params(1).species '.ncrna.fas'];
if exist(ncrna, 'file')
delete(ncrna);
end
fastawrite(ncrna, Header, Sequence);
if params(1).verbose
disp('generating ncrna database files for Blast');
end
ncrnaDb = [params(1).species '.ncrnaDb.fas'];
% MatLab's use of blastlocal requires short entry names
simpleHeader = Header;
for n = 1:length(Header)
pos = regexp(Header{n,1}, ':');
simpleHeader{n,1} = Header{n,1}(1:pos(1)-1);
end
if exist(ncrnaDb, 'file')
delete([ncrnaDb '*']);
end
fastawrite(ncrnaDb, simpleHeader, Sequence);
blastformat('Inputdb', ncrnaDb,...
'FormatArgs', '-o T -p F');