-
Notifications
You must be signed in to change notification settings - Fork 1
/
cdnaParse.m
80 lines (66 loc) · 1.99 KB
/
cdnaParse.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
function [Header,Sequence]=cdnaParse(cdna,seqData,params)
% params = struct('species','Mouse','verbose',1,...
% 'dir1','C:\FISHerMan\Db\Mouse38.cdna.fa',...
% 'keys',{'cdna','gene:\S*'});
if params(1).verbose
disp('reading the cdna data file');
end
[Header, Sequence] = fastaread(cdna);
Header = Header';
Sequence = Sequence';
if params(1).verbose
disp(' trimming fasta headers');
end
for i = 1:length(params)
[pos1{i,1}, pos2{i,1}] = regexp(Header, params(i).keys, 'start', 'end');
end
for n = 1:length(Header)
temp = Header{n,1};
temp1 = temp(1:pos1{1,1}{n,1}-2);
temp2 = temp(pos1{2,1}{n,1}+5:pos2{2,1}{n,1});
if isempty(temp1)
disp('missing transcript ID');
quit;
elseif strfind(temp1,'.')
temp1pos=strfind(temp1,'.');
temp1=temp1(1:temp1pos(1)-1);
end
if isempty(temp2)
disp('missing gene ID');
quit;
elseif strfind(temp2,'.')
temp2pos=strfind(temp2,'.');
temp2=temp2(1:temp2pos(1)-1);
end
Header{n,1} = strcat(temp1, ':', temp2);
end
if ~isempty(seqData)
if params(1).verbose
disp(' picking expressed sequences according to RNA-seq data');
end
[Header, Sequence] = pickExpressedSeq(seqData, Header, Sequence);
end
if params(1).verbose
disp('saving the cdna fasta file');
end
cdna = [params(1).species '.cdna.fas'];
if exist(cdna, 'file')
delete(cdna);
end
fastawrite(cdna, Header, Sequence);
if params(1).verbose
disp('generating cdna database files for Blast');
end
cdnaDb = [params(1).species '.cdnaDb.fas'];
% MatLab's use of blastlocal requires short entry names
simpleHeader = Header;
for n = 1:length(Header)
pos = regexp(Header{n,1}, ':');
simpleHeader{n,1} = Header{n,1}(1:pos(1)-1);
end
if exist(cdnaDb, 'file')
delete([cdnaDb '*']);
end
fastawrite(cdnaDb, simpleHeader, Sequence);
blastformat('Inputdb', cdnaDb,...
'FormatArgs', '-o T -p F');