-
Notifications
You must be signed in to change notification settings - Fork 0
/
AssignMALDI_fun.py
1260 lines (1076 loc) · 44.4 KB
/
AssignMALDI_fun.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#This file conains data processing functions
#called from AssignMALDI_Win
##030821 Converting to python3x
import sys,os
sys.path.insert(1, r'./../functions') # add to pythonpath
from isotope_ed import isotope_fun
from bisect import bisect_left
import math
from pickpk_lmp import pickpk
from common import *
def convertcomp(comp):
#takes in chemical formula and makes a list in a specific order
#input composition is C20H38O11Na1
#output should be [38,20,(n),11,(s),0]
comp = comp.replace('\n','')
alist=[0,0,0,0,0,0]
order=['H','C','N','O','S','X']
for i,c in enumerate(order):
if c in comp:
num = getint(comp,comp.find(c)+1)
try: num = int(num)
except: print ('skipping '+num); pass
alist[i]=num
return alist
def getint(astr,i):
#Gets an int at or after i
#i is where to start in A23B22
#called from convertComp
val=''
while i<len(astr):
c=astr[i]
if c>='0' and c<='9':
val+=c
else:
try: val = int(val); break
except: return 0
i+=1
return val
def takeClosest(myList, myNumber):
"""
Assumes myList is sorted. Returns closest value to myNumber.
If two numbers are equally close, return the smallest number.
"""
#called from matchMasses
pos = bisect_left(myList, myNumber)
if pos == 0:
return myList[0],0
if pos == len(myList):
return myList[-1],len(myList)
before = myList[pos - 1]
after = myList[pos]
if after - myNumber < myNumber - before:
return after, pos
else:
return before,pos-1
def getMax(i4,i2,wh): #masses, intensities
#called from matchMasses
maxi=0;co=0
for r in range(0,len(i4[wh])):
if i2[wh][r]>maxi: maxi=i2[wh][r]; co=r
return co
def getIsoInts(i2,pki,wh=0):
#Converts iso %s to intensities based on first peak in picked series
#i2 = isotopic intensity list from a series
#pki=pk intensity from pk of interest
i2adj=[]
bpk=pki/i2[wh] #finds OF based on first peak (IS)/%
for per in i2: #isotopic intensities from one assignment
i2adj.append(bpk*per) #% * of = is
return i2adj
def teststart(ui):
pklist = processData([],ui,ui.infiles[0],0) #returns picked peaks
spath=os.path.dirname(os.path.realpath(sys.argv[0]))
nufile=os.path.join(spath,ui.pickedpks)
write2DList(nufile,pklist,'w')
return nufile
def subtractLists(alist,blist,limit=False):
#subtract b from a
#bstart tells where to start in b
#limit says to only subtract parts of list that exist if true
sublist=[]
for a,b in zip(alist,blist): sublist.append(a-b)
if len(alist)>len(blist) and limit==False: sublist+=alist[len(blist):]
if len(blist)>len(alist) and limit==False:
for b in blist[len(alist):]: sublist.append(-1*b)
return sublist
def firstOrBiggest(iints,alist,test=False):
#iints are isotopic intensities. alist is a list of adjusted intensities
#called from compareToPrediction
ria=[];mins=''
ia0 = getIsoInts(iints,alist[0],wh=0)#wh shifts up since we don't know the intensities before 0
ia0dif = subtractLists(alist,ia0)
#if test==True:print('alist',alist);print('ia0',ia0);print('dif',ia0dif)
asum=sum([abs(d) for d in ia0dif]) #total differences for each r
if not mins: mins=asum; ria=ia0 #save smallest differences
elif mins>asum: mins=asum;ria=ia0
#if test==True: print('asum0',asum)
im = alist.index(max(alist))
iam = getIsoInts(iints,alist[im],wh=im)#based on biggest peak
iamdif = subtractLists(alist,iam)
#if test==True:print('iam',iam[:5])
msum=sum([abs(d) for d in iamdif]) #totals absoluted differences for each r
if mins>msum: mins=msum;ria=iam
#if test==True: print('asumBig',asum,msum,mins)
return ria,mins
def compareToPrediction(pks,iints,X,Y,err):
#pks is peak series of interest = [[mass,area,XYiteratr,unadjusted area,times used]]
#iints=isotopic intensities from a series
#overlap is taken care of in pickpatterns::
# A rough iterwin by area is deleted from peak patterns
# before getting next peak pattern.
# the adjusted area is at ii=1 in pks while the original area is at ii=3
tf=False #testing flag
#if 1905<pks[0][0]<1906: tf=True
besti=[];ms=0
ys = [Y[pk[2]] for pk in pks] #gets actual intensities
yu = [pk[-1] for pk in pks] #gets whether used or not
if tf==True:
print ('ys',ys)
for pk in pks: print('>',pk[-1])
for e,yused in enumerate(yu[1:5]): #see if any peaks have no overlap
if tf==True: print('0',getIsoInts(iints,ys[0],wh=0))
if yused==1:
besti = getIsoInts(iints,ys[e+1],wh=e+1);
sublist=subtractLists(ys,besti,limit=True)
avg,std=astdev(sublist[1:5]) #c12 pk causing issues for higher mw
if tf==True: print('besti',besti);print('sublist',sublist)
if avg<1:besti=[]
if tf==True: print('avg',avg,std)
#print('using',pks[0][0],e+1,avg,len(besti));
break
if not besti:
yadj = [pk[1]/pk[3]*yi for yi,pk in zip(ys,pks)] #gets adjusted intensities
besti,ms=firstOrBiggest(iints,yadj,test=tf)
if tf==True: print('ys',ys);print('besti',besti)
area=0;isom=[];isoi=[]
for pk,bi in zip(pks,besti): #this will stop at shortest list
area+=bi/Y[pk[2]]*pk[3] #percent difference times calculated area of the whole peak
isom.append(pk[0])
isoi.append(bi) #this is just to make lists the same length below
#if tf==True: exit()
return area,isoi,isom
def matchMasses(X,Y,pks,isodm,infile,pltnum,ui):
#called from processData
#X=mass list from spectra
#Y=int list from spectra
#pks = picked = list of list of [mass,area,midpoint Xiter,unadjusted.area]
#isodm = isotopic patterns by mass [mass,[masslist,intenlist,name]]
#axl = list of plot axis
isodm.sort() #lowest mass first?
i1=[]; i2 = [] #separate into 2 lists
i3=[]
i4=[];i5=[];i6=[]
err=ui.error
ca=0
for a,b in isodm: #isotopic patterns
i1.append(a) #monoisotopic mass predicted
i2.append(b[1]) #intensities %s
i3.append(b[2]) #names
i4.append(b[0]) #masses
whm=getMax(i4,i2,ca) #returns ii to highest intensity
i5.append(whm) #where mass and intensity with most intensity in that set
i6.append(b[0][whm]) #biggest mass (centroid)
ca+=1
mX=[] #stores matches
mY=[]
labels=[]
#matches filter if first peak and 2nd peak
isowin=[]
isowhere=[]
totarea=[]
matches=[]
masses=[]
for i,pk in enumerate(pks): # pk = list of peaks in series
mass,ara,midi,yy,tu = pk[0] #[mass,area,midpoint Xiter,unadjarea,times used]
error=err
area=0;i2adj=[];isom=[]
#compares library masses (i1) to picked peak mass
answ = takeClosest(i1,mass) #close = value, where = iter in i1 & i2 &labels
close,where=answ #where is iosotope iter. Close is isotope mass
if where>=len(i1): continue #was causing error further down
go=False
if mass>3000: #mainly to avoid first of list problem, but blob pks start here
if 0<(mass-close)<3 and mass-pks[i-1][0][-1]>3: #pkmass is within 3 daltons higher than library mass and there's no close mass to the left
area,i2adj,isom=centerSearch(i2[where],i5[where],i6[where],pks[i],Y)
if area>0:go=False;True
if abs(mass-close)<=error: # and abs(X[start]-close)<=err: #found match
#need to get intensities at masses that match intensities in i2adj = i4[where]
ee=i-1;iw=[]
if i==0: ee=0;
area,i2adj,isom = compareToPrediction(pk,i2[where],X,Y,err)
go=True
if go==True:
if labels:
if i3[where]==labels[-1]: #same as before!
if area<totarea[-1]: continue #smaller than before
#else the new peak is larger - use it.
isowin[-1]=i2adj; isowhere[-1]=isom
mX[-1]=pk[0][0];mY[-1]=Y[pk[0][2]]
totarea[-1]=area
continue
isowin.append(i2adj) #list of predicted intensities
isowhere.append(isom) #list of predicted masses
mX.append(pk[0][0])#mass
mY.append(Y[pk[0][2]])#realInt
labels.append(i3[where])
totarea.append(area)
masses.append(mass)
nulabels,nuX,nuY,nutotarea,numasses,skips = sumMod(labels,mX,mY,totarea,masses,ui)#adds -32 pk intensity to main pk
nufile=output1(nulabels,nuX,nutotarea,infile,numasses) #goes to tables
ui.pfiles.append(output2(labels,mX,mY,infile,masses,isowhere,isowin,skips)) #holds processed data to insert into spectra
return nufile
def centerSearch(iints,iwh,bmi,alist,Y):
#what if blob peak or higher MW? Can I identify using biggest iterator?
#bmi = biggest mass of iterator. alist = ten pks close to search for mass;iwh=where mass is biggest in iso
#iints = iso int % intensities
#called from matchMasses
pks =[];maxi=0;ints=[];mi=0 #maxi=max intensity;ints=list of pkIntensities
#first get rid of extra peaks.
i=0
for pk in alist[1:]:
i+=1
if pk[0]-alist[i-1][0]<=2: #current pk mass - previous pk mass
pks.append(alist[i-1]) #cluster peaks within 2 daltons of each other
ints.append(Y[alist[i-1][2]]) #intensities
if ints[-1]>maxi: maxi=ints[-1]; mi=i-1 #save max int and location
else: break
if len(pks)<3: return 0,[],[] #want at least 3 peaks in our blob
#now see if bmi matches a peak
st=mi
if mi>0: st=mi-1
isoints=[]
for i,pk in enumerate(pks[st:mi+1]): # biggest peak and one on either side.
mass,ara,midi,yy,tu = pk #[mass,area,midpoint Xiter,unadjarea,times used]
if abs(mass-bmi)<0.5: #match to biggest peak
isoints=getIsoInts(iints,maxi,iwh)#isopercents,max pk intensity,which isoint to use
if not isoints: return 0,[],[] #nothing found
#add missing peaks based on isowin. need mass,int,area
ratio = pks[mi][1]/bmi #ratio of biggest pk area to biggest isoints
sub=iwh #assumes 1Da z. biggest isomass - iterator = starting mass
area=0;isom=[];isoi=[]
for bi in isoints[:10]: #cut off at 10
area+=bi*ratio #percent difference times calculated area of the whole peak
isom.append(bmi-sub);sub+=1
isoi.append(bi) #this is just to make lists the same length below
return area,isoi,isom
def matchAssignments(X,Y,pks,isodm,infile,pltnum,ui,alist):
#called from processData
#assignments already verified by user. Just need to get area and redo files.
#X=mass list from spectra
#Y=int list from spectra
#pks = picked = iterator in list (first,mid,last)..now list of list of [mass,area,midpoint Xiter]
#isodm = isotopic patterns by mass [mass,[masslist,intenlist,name]]
#alist = list of assignments [label,mX,masses,mY]
labels=[];mX=[];masses=[];mY=[]
for a,b,c,d in alist: #previously assigned pks
labels.append(a)
mX.append(b)
masses.append(c)
mY.append(d)
totint= [0]*len(mY)
error=ui.error
unass = [pk[0][0] for pk in pks] #list of picked,unassigned masses
for e,mass in enumerate(masses): #already picked pks
answ = takeClosest(unass,mass) #close = value, where = iter in i1 & i2 &labels
close,where=answ #where is pks iter. Close is unassigned mass
if abs(mass-close)<=error: # and abs(X[start]-close)<=err: #found match
ilist=[];area=0;maxa=0;maxi=0
for pip in pks[where]: #pks in series
area+=pip[1]
totint[e]=area
nulabels,nuX,nuY,nutotint,numasses,skips = sumMod(labels,mX,mY,totint,masses,ui)#adds -32 pk intensity to main pk
nufile=output1(nulabels,nuX,nutotint,infile,numasses) #prints off procpks.txt
return nufile
def updateAverages(ui):
#coming from saveChanges in plot window
#need to convert the new pks-gp files to pks.txt files, the call makeOutputFiles
#reprocess everything but force new assignments!
nufiles=[]
co=1
for f,p in zip(ui.infiles,ui.pfiles):
data=readFile(p,True,special='###') #name, foundmass,calcmass,int
data = sorted(data,key=lambda x:x[2])#by mass?
nufiles.append(processData(ui.glylist,ui,f,co,data))#returns procpks.txt file
co+=1
makeOutputFiles(ui,nufiles)
def getMaxY(X,Y,ui):
#pulls out highest Y between limits of X set by user
try: start=int(ui.mzstart); end=int(ui.mzend)
except: return max(Y)
maxy=0
for x,y in zip(X,Y):
if x>=start and x<=end:
if y>maxy: maxy=y
return maxy+(maxy*.05)
def subMod(ui,name):
#returns name without mod if mod is in it..Subtracts!
#only one mod should be in each name.
for mod in ui.mods: #things subtracted or added such as incomplete pm
ans=name.find(mod[0]) #returns index or -1
if ans>-1:
name=name[:ans]+name[ans+len(mod[0]):]
return name
def sumMod(labels,mX,mY,totarea,masses,ui):
#if a mod like -32, then need to add it into the results.
#for testing calculate an average % - if above, then adjust accordingly and indicate own assignment?
suml = []
nuL=[];nuX=[];nuI=[];nuY=[]; nuM=[]
for L,M,T,Y,mas in zip(labels,mX,totarea,mY,masses): #stick them all together so can sort
suml.append([L,M,T,Y,mas])
suml.sort() #by label
oldL=''; oldT='';oldM='';oldY=0; oldmas=0;skips=[];co=0
for L,M,T,Y,mas in suml:
co+=1
aname = subMod(ui,L)
if aname!=L: #something was subtracted, so mod exists.
if aname==oldL: #mod peak and parent peak exist
if T/oldT>0.5: print('check ',oldL,'! mod peak big',oldT,T)
oldT+=T #add mod to reg
else: #mod peak with no parent
skips.append(co)
else: #parent peak
if oldL:
if not nuL or oldL!=nuL[-1] and oldL!=L: #happens if modpeak without parent or if same assi twice
nuL.append(oldL); nuX.append(oldM); nuI.append(oldT);nuY.append(oldY);nuM.append(oldmas)
if L==oldL: #may be a repeat. Take biggest if is
if oldT>T: continue #skip that peak. Otherwise gets reset below
oldL=L
oldT=T
oldM=M
oldY=Y
oldmas=mas
#catch last
if not nuL or oldL!=nuL[-1]:
nuL.append(oldL); nuX.append(oldM); nuI.append(oldT); nuY.append(oldY);nuM.append(oldmas)
return nuL,nuX,nuY,nuI,nuM,skips
def output1(labels,mX,totint,infile,masses):
#output final masses including with -32pk added, for example.
outfile=getbase(infile) #returns infile without .txt
outfile += 'pks.txt' #adds pks.txt
fout = open(outfile,'w')
header = "Name\tFMass\tCMass\tIntensity\n"
t='\t'
fout.write(header)
for i,l in enumerate(labels):
line = l+t+'%.4f'%mX[i]+t+'%.4f'%masses[i]+t+'%.0f'%totint[i]+'\n'
fout.write(line)
fout.close()
return outfile
def output2(labels,mX,mY,infile,masses,isowhere,isowin,skips):
#outputs initial masses for use in graph. Name: '57-1pks-gp.txt'
#was going to a working directory, but if I want to save project for later, needs to go with data.
#skips = iterators of modded assignments without parents - so skip!
base=getbase(infile)
base += 'pks-gp.txt'
###May want to put a try statement here to avoid admin errors!
#if not os.path.exists(wdir): os.makedirs(wdir)
#outfile = os.path.join(wdir,base)
fout = open(base,'w')
header = "Name\tFMass\tCMass\tIntensity\n"
t='\t'
fout.write(header)
for i,l in enumerate(labels):
if i in skips: continue
line = l+t+'%.4f'%mX[i]+t+'%.4f'%masses[i]+t+'%.0f'%mY[i]+'\n'
fout.write(line)
fout.write("###\n")
co=-1
for ir,iw in zip(isowhere,isowin):
co+=1
if co in skips: continue
line = str(ir)+'\t'+str(iw)+'\n'
fout.write(line)
fout.close()
return base
def averageList(files,ui,Hex=False):
#from APL - edited to take in files [name,mass,int]
#list = expected [name,mass,???,[data1 name,mass,area,%abun],[data2 name,mass,area,%abun]]
#If Hex = false don't include Hex or sugs with HexNAc1 in natural abundance calc and don't output
#NOTE: When done, total percentage won't = 100 because average of all files. It should = 100 for individual files
nulist=[]
nudic={} #name:[mass,[inte]]
permonol={}
performl=[0,0,0,0] #store #without dhex or sa, #with just dhex, with just sa or with both
start,end = os.path.split(files[0])
start,end = os.path.split(start)
for f in files:
fin=open(f,'r')
namel=[]
massl=[]
intl=[]
intl2=[]
calmass=[]
#1. get data from file
for line in fin:
if 'Name' in line: continue #header
name,cmass,mass,inte=line.split()
###FILTERS for N-glycans!
if Hex==False:
ad = makeDictFromName(name)
if 'HexNAc' not in ad: continue
if 'HexNAc' in ad:
if ad['HexNAc']<2:continue
#if name=='HexNAc2': continue #throwing off % abundance
namel.append(name)
massl.append(float(mass))
intl.append(float(inte))
calmass.append(float(cmass)) #calculated mass #should be the same for same assignments
fin.close()
if not intl: continue
#2. calculate %abundance
totsum = sum(intl)
maxnum = max(intl)
perint=[]
for i,inte in enumerate(intl): #for each intensity
try:
intl2.append((inte/float(totsum))*100) #% abundance
perint.append((inte/float(maxnum))*100) #what about % of greatest
except: intl2.append(0); perint.append(0)
permono={'Hex':0,'HexNAc':0,'NeuAc':0,'dHex':0}#061718 for perMono
for i,n in enumerate(namel): #for each label
lets,nums = getSugNums(n)
for sug,num in zip(lets,nums): #i.e. hex,2
try: permono[sug]+=num*intl2[i]
except: pass
#for average
if n not in nudic:
nudic[n]=[calmass[i],[massl[i]],[intl2[i]],[perint[i]],[intl[i]]] #list of [mass,[%abun],[pergreatest],[percent area]]
else: #makes lists of %abun and %highest so can average below
nudic[n][1].append(massl[i])
nudic[n][2].append(intl2[i]) #add onto inte list
nudic[n][3].append(perint[i])
nudic[n][4].append(intl[i])
for k,v in permono.items(): #get lists of each type
if k not in permonol: permonol[k]=[]
permonol[k].append(v)
for name,alist in nudic.items(): #contains info from each file for each assignment
if len(files)==1:
nulist.append([name,myround(alist[0],2),myround(alist[1][0],2),
str(myround((alist[0]-alist[1][0])/(alist[0]*1e-6),1)),
alist[2][0],0,alist[3][0],0,1]+alist[4]) #alist[0]=mass
continue
if len(alist[1])<ui.minfile: continue
avg0,std0=astdev(alist[1]) #for found mass
avg1,std1=astdev(alist[2]) #for %abundance
avg2,std2=astdev(alist[3]) #for percent of highest
avg3,std3=astdev(alist[4]) #for area
delta = str(myround((avg0-alist[0])/(avg0*1e-6),1))
astr = str(myround(avg0,2))+' +/- '+str(myround(std0,2))
#calculate percent with each type-051320
if 'dHex' in name:
if 'NeuAc' in name: performl[3]+=avg1
else: performl[1]+=avg1
elif 'NeuAc' in name: performl[2]+=avg1
else: performl[0]+=avg1
nulist.append([name,myround(alist[0],2),astr,delta,avg1,std1,avg2,std2,len(alist[1])]+alist[4]) #alist[0]=mass
mono=[end] #mutant name
if permonol:
mono+=astdev(permonol['Hex'])+astdev(permonol['HexNAc'])+astdev(permonol['dHex'])+astdev(permonol['NeuAc'])
return nulist,[mono] #mono is [name, avg+std of hexoses, avg+std of N's, avg+std of D's, avg+std SA]
def getSugNums(astr):
#gets numbers from long hand list
#called from AverageList
lets=[]
nums=[]
co=0
tn='' #tempnum
tl='' #templet
for ch in astr:
if ch.isalpha() or ch=='*':
tl+=ch
if tn: nums.append(int(tn)); tn=''
else:
tn+=ch
if tl: lets.append(tl) ; tl=''
if tn:
try: anum = int(tn)
except: anum = getTotalNum(tn) #in case '-' in number -happens at end
nums.append(anum); tn='' #catch last
if tl: lets.append(tl) ; tl='' #shouldn't happen
return lets,nums
def astdev(alist,exp=0,twoD=False,dim=0):
#exp = what happens if find peaks in only one file but they should be in 3? exp says how many there should be and plugs in 0's to get a bigger stdev.
#case where want to add up all values in 2D list: dim='all'
blist=[]
if twoD==True: #Make a list of all positions at 'dim' spot in 2D list
for line in alist:
if dim=='all':
for i in line: blist.append(i)
else:
blist.append(line[dim])
alist=blist[:]
if exp==0: exp=len(alist)
for r in range(len(alist),exp):
alist.append(0)
average = findAvg(alist) #100716
total = 0.0
c=0
for i in alist:
total = total+((i-average)*(i-average))
c+=1
try: #12/07/16
stdev = math.sqrt(total/c)
except: stdev=0
return average,stdev
def findAvg(list):
#100716
c=0
total=0
for i in list:
total=total+float(i)
c+=1
if c==0: return 0
return total/c
def subtractBaseline(X,Y,baseline):
#baseline=[[x,y]] may not start on X. So use first or most recently passed bx as baseline
nuY=[]; eb=0; bx=baseline[eb][0];
for x,y in zip(X,Y):
if x>bx and eb<len(baseline)-1: eb+=1 #I'm assuming x will never skip, but bx might
by = baseline[eb][1]
nuv=y-by
if nuv<0: nuv=0
nuY.append(nuv)
return nuY
def findClosest(alist,val,wh):
#returns value and iterator in 2Dlist.
#wh is where in 2D list
ans=alist[min(range(len(alist)),key=lambda i: abs(alist[i][wh]-val))]
ii=alist.index(ans)
return ans,ii #ans is a list from the 2D list
def guessIsowinArea(mass,area):
##Calculate areas for isowin?
win=[]
win.append(1.0048*math.exp((-5.719e-4)*mass))
win.append(-0.7813+1.1228*math.exp(-0.5*(math.log(mass/1736.3154)/1.7195)**2))
win.append(0.0825+0.1694*math.exp(-0.5*(math.log(mass/3319.5970)/0.6294)**2))
win.append(0.0185+0.1949*math.exp(-0.5*(math.log(mass/5407.1746)/0.7066)**2))
win.append(0.0043+0.2093*math.exp(-0.5*(math.log(mass/8648.3239)/0.7315)**2))
#ma=max(win) #peak with maximum area
#ind = win.index(ma)
total = area/win[0] #assumes first peak in series
areas=[]
for w in win:
areas.append(w*(total))
return areas #, ind #list and index of max intensity
def pickPatterns(X,Y,pklist,derr,ca=True,perr=0.1):
#perr=in percent, derr=dalton error
#ca = calculate area. Only do first time
#baseline = [int]
#figured out slopes of each isotope mass vs. (int1-int2)/int1 = linear.
#can I use them to pick isotopic windows? Will iterate through list, subtracting windows each time till all picked.
#slopes up to five isotopes peak:(slope,y-intercept)
#092921 - issues with overlaps. try using isotopic wins to adjust areas.
areas=pklist
nuY=Y[:]#baseline
if ca==True:
areas = calcArea2(X,nuY,pklist) #returns mass,area,Xiterator(mid)
else: areas=pklist
plist=[] #pattern list
nua=[a for a in areas] #simple copy was affecting areas when nua changed
nc=0
isoareas=[]; maxlen=len(nua)
rejects=[] #for testing
while nc<maxlen: #for each peak
tf=False
#if 4314<nua[nc][0]<4315: tf=True
if tf==True: print('1070START',nua[nc])
x,a,it = nua[nc][:3] #mass, area,iter of picked peaks
if len(nua[nc])<4: nua[nc]+=[a,0] #adds area to 4th position and times used to 5th
if tf==True: print('MASS',nua[nc])
nd=nc; nc+=1
if a<=0: continue #no area, skip
series = []; temp=[];isoa=0
##Grab 10 picked peaks from list
temp=nua[nd:nd+11] #only first has yy at end.
##initialize series
series.append(temp[0]) #first peak to start
e=0;elist=[nd] #temp counter
while e <len(temp): #first 10 peaks in case of overlap - just need 5
xo=series[-1][0] #get mass of last peak added
##find closest peak 1 dalton away
ans,e=findClosest(temp,xo+1,wh=0) #+1= a dalton
if e==-1: break #not found
if tf==True: print ("error test",abs(ans[0]-(xo+1)),'vs',derr/2)
div=2
if xo>3000: div=1 #for blob peaks
if abs(ans[0]-(xo+1))>derr/div: break #more than a dalton away
if len(temp[e])==3: temp[e]+=[temp[e][1],0] #add area to fourth position and times used to 5th
series.append(temp[e])
elist.append(nd+e)
if tf==True: print('SERIES',len(series),series)
if len(series)<2: continue
##get isowin areas
isoareas = guessIsowinArea(x,a)#findBestIsowin(series)
silist=[s[1] for s in series] #areas
if tf==True:print('SC ',series[0][0],silist,'**',isoareas)
if slopeCheck(silist,isoareas)==False:
yints=[Y[s[2]] for s in series] #try it with intensities!
if tf==True: print('WITH INTS',yints)
if slopeCheck(yints,isoareas)==False:
if tf==True:print('1105 slopecheckfail')#,series[0][0],silist,'\n',isoareas)
continue
##go through series and adjust a as needed based on isowin
nuseries=[]
for e,s in enumerate(series):
x,a,it,aa,tu=s
if e<len(isoareas): #isoareas only 5 pks long
isoa = isoareas[e]
else:
if isoa<100: break #way down in noise
else: isoa=isoa/2 #will be small, just decrease it
ratio=1
if a!=0 and isoa!=0: ratio=min([a,isoa])/max([a,isoa])
tu+=1 #true if new series is longer than 1!
if isoa>=a or ratio>0.9:
nua[elist[e]]=[x,0,it,aa,tu] #used up whole area a,y stay the same
if a*0.9>isoa:
nua[elist[e]]=[x,a-isoa,it,aa,tu] #append what's left of a and y for next series
a=isoa
if tf==True:print('FINAL',x,a,isoa,aa,nua[elist[e]])
nuseries.append([x,a,it,aa,tu]) #append a and y of this pk/series
if tf==True: print('series',nuseries)
if len(nuseries)>1: plist.append(nuseries)
else: rejects.append(nuseries)
#need to update times used in stuff saved earlier
os=[0]#iterators to old series
for f,series in enumerate(plist[1:]): #for each series
for p1 in series:#for eack peak in series
poplist=[]
for e,ii in enumerate(os): #for each old series
if plist[ii][-1][0]<p1[0]:poplist.append(e);continue
for g,p2 in enumerate(plist[ii]): #for each pk in old series
if p2[2]==p1[2]:plist[ii][g][4]+=1;break #was used in next series
poplist.reverse()
for n in poplist: #go backwards since list is getting shorter!
os.pop(n)
os.append(f+1)
return plist
def slopeCheck(alist,blist):
#isotopic windows past 2K go up then down, make sure window has same pattern as peaks
#alist/blist = 1D lists of intensities. Assume blist is isowin, alist is data
#Can only test for unexpected negative slopes - overlap can cause unexpected upper ones
maxb=max(blist);mbi=blist.index(maxb)
maxa=max(alist);mai=alist.index(maxa)
if alist[0]==maxa and blist[0]==maxb: return True #both slope down
co=0;close=True
for a,b in zip(alist,blist): #slope in that fuzzy range - see if all pks w/n 10% of max
if abs(a-b)/maxa>0.1: close=False
co+=1
if co==max([mbi,mai]): break #limit comparison cause may be overlap
if close==True: return True
if mai<mbi: #alist goes down while blist still going up
if mai+1!=mbi: return False #possibly intensities of c12x and c12x+1 pks really close but c12x+1 should ==mbi
if len(alist)>mai+1:
if alist[mai+1]/alist[mai]>0.9 and blist[mbi-1]/blist[mbi]>0.9:
return True #c12x and c12x+1 close intensities
return False
else: mai=mbi
bslope = (maxb-blist[0])/(mbi+1) #treating X is 0,1,2,3,etc
aslope = (maxa-alist[0])/(mai+1) #+1 avoids 0
if bslope>0 and aslope>0: return True #both are positive
return False
def calcArea2(X,Y,pklist):
#Can I do this for broad peaks AND take into account gross overlap?
#datai = iterators to peaks
#range = all iterators to peaks in isotopic window
#make sure dataints not too far from iosints...will do later
alist=[] #stores mass,area,miditerator
for s,m,e in pklist: #start, middle, end
area=0
for ci in range(s,e+1): #iterators I hope
wid=X[ci+1]-X[ci]
ls=Y[ci];ll=Y[ci+1] #leg short, long leg
if ls>ll: ll=ls; ls=Y[ci+1]
area += (wid*ls)+(wid*(ll-ls)/2) #rectangle + triangle
alist.append([X[m],area,m])
return alist #mass,area, miditerator
def write2DList(fileout,alist,flag):
#if testFile(fileout)=="False": print "Could not open ",fileout; return
fout = open(fileout,flag)
for row in alist:
line=''
for i,item in enumerate(row):
try: line+=item
except: line+=str(item)
if i<len(row)-1:
line+='\t'
line+='\n'
fout.write(line)
print("Writing",fileout)
print("\nALL DONE!")
fout.close()
def orderList(data,ui):#glylist,useall='False'):
#order by mass and glylist. Apply filters to what is output
#if useall==True then include everything in lib even if not found
#if useall=allN include everything with >1HexNAc. someN=only assigned N-glycs
#data format = [name,mass,avgint,stdev]
#glylist format = [name,mass,chemicalformula]
glylist = ui.glylist
useall = ui.useout
if ui.oligos=='yes': #save N-glycans only
if ui.useout=='True': useall='allN' #outputs all assigned or unassigned N-glycans
else: useall='someN'
adict={} #store name and where for data
header=['sugar','calcmass','avgmass(Da)','delta(ppm)','%abun','std','%max','std','#files']
nulist=[header]
linelen=len(glylist[0])
data = sorted(data,key=lambda x:x[1])#by mass
glylist = sorted(glylist,key=lambda x:x[1])#by mass
dc=0;gc=0 #counters
dline=[]; gline=[]
while(dc<len(data) or gc<len(glylist)):
if dc<len(data):dline=data[dc];
if gc<len(glylist):gline=glylist[gc]
if dline[0]==gline[0]: #library name matches assignment
if dc<len(data): nulist.append(data[dc]) #
dc+=1; gc+=1; continue
if dline[1]<gline[1] or gc>=len(glylist): #an assignment not in library
if dc<len(data): nulist.append(data[dc]); dc+=1 #make sure just not at end of data
if gline[1]<dline[1] or dc>=len(data): #an assignment only in glylist
if useall=='someN' or useall=='allN': #print only expected N-glycans
name = gline[0];use=True
ad = makeDictFromName(name)
if 'HexNAc' not in ad: use=False
if 'HexNAc' in ad:
if ad['HexNAc']<2:use=False
if '-' in name: use=False #mods
#if name=='HexNAc2': use=False #throwing off % abundance
if use==False: gc+=1; continue
if useall=='True' or useall=='allN':
nulist.append([gline[0]]+[0]*linelen); #prints blanks
gc+=1
return nulist
def getglylist(libf):
#list of longname, mass, chemistry
glylist=[]
fin = open(libf,'r')
for line in fin:
glylist.append(line.split())
fin.close()
glist = [[a,float(b),c] for a,b,c in glylist] #get floats
glylist =sorted(glist,key=lambda x:x[1])
return glylist
def testmod(ui,name):
#Mods are things like missed permethylation
for mod in ui.mods:
if mod[0] in name: return False
return True
def getBuildingBlocks(ui,limit=750):
#for GUESSING assignments
#limit is maximum daltons of block
bbdic={};err=.00005
co=0;
if not ui.glylist: ui.glylist=getglylist(ui.libfile) #might happen when open previous
glist = ui.glylist #[[a,float(b),c] for a,b,c in ui.glylist] #library[name,mass,chem]
while(co<len(glist)):
if testmod(ui,glist[co][0])==False: co+=1;continue #is mod in name?
sdic=makeDicFromName(glist[co][0],ui)
smass = float(glist[co][1])
schem = makeDicFromName(glist[co][2],'') #puts atoms in dict, not sugs
if len(sdic)==1: co+=1; continue #skip hexoses
for a,b,c in glist[co+1:]: #name,mass,chem #all but first
b=float(b)
if b-smass>limit: continue
cdic=makeDicFromName(a,ui)
if len(cdic)==1: continue #skip hexoses
dicd=subtractDict_neg(cdic,sdic) #subtracts lower mass dic from d1
ans=convertDicToString_special(dicd,ui.order) #using order to make sure can compare in dic
if '-' in ans: continue
if ans not in bbdic:
bbdic[ans]=[b-smass, subtractDict_neg(makeDicFromName(c,''),schem)] #mass and chem diffs
co+=1
ui.blocks=[]
for k,v in bbdic.items():
ui.blocks.append([v[0],k,v[1]]) #mass,name,chem-dic
ui.blocks.sort()
def getOrder(glylist):
#gets user preferences for order and monosaccharides from incoming library
#should be before monos (-14D) is added.
order={} #holds mononame and order
for name,mass,CF in glylist:
#make list from name
alist=convertStringToList(name)
alist.reverse() #gives 2, HexNac, 5 Hex
co=-1
for part in alist:
if isnum(part): continue #skip numbers
co+=1
parts=part.split('*') #in case tag
sug=parts[-1] #because tag sticks to the following sugar (*pp*Hex)
if not sug: continue #tag was on end of sugar so gives ''
if sug not in order: order[sug]=co
else:
if order[sug]<co: order[sug]=co #will adjust co up only
#now convert to list
olist=[];r=0
while len(olist)<len(order):
for k,v in order.items():
if r==v: olist.append(k)
r+=1
olist.reverse()
return olist
def makeDicFromName(name,ui):
#takes in Hex5HexNAc2. Returns {'hex':5,'hexNac:2}
#tags come after * and are repeated instead of numbered *aa*, *ab*. NeuAc2*ab*Hex4
#labels will be NeuAc*a, NeuAc*b - have to split count.
#ui.torder is an attempt to figure out order of monos in labels from library instead of user question.
def addToDic(astr,anum,ui):
if anum:
if astr not in adic: adic[astr]=0
adic[astr]=int(anum);
astr=''; anum=''
return astr,anum
####################
adic={}
astr='';anum='';tagf=False;tags={}
if ui:
for mod in ui.mods: #things subtracted or added such as incomplete pm
ans=name.find(mod[0]) #returns index or -1
if ans>-1:
addToDic(mod[0],1,ui)
name=name[:ans]+name[ans+len(mod[0]):]
for ch in name:
if ch=='*': #will catch prior to isnumeric.
if tagf==False: tagf=True; continue #skip first '*'
if tagf==True: #flags collected, now do something with them
tagf=False; #set for next time
tot=int(anum) #keep track of total tags added cause some tags may be either/or
for tag,num in tags.items():
t1,t2 = addToDic(astr+'*'+tag,num,ui) #this will add ie. NeuAc*p
tot-=num
if tot>0: t1,t2=addToDic(astr,tot,ui)
astr='';anum='';tags={}
continue #skip '*'
if ch.isalpha():
if tagf==True:
if ch not in tags: tags[ch]=0
tags[ch]+=1
continue
astr,anum = addToDic(astr,anum,ui) #doesn't work if no anum
astr+=ch
if ch.isnumeric(): anum+=ch
astr,anum = addToDic(astr,anum,ui) #catch last if there
return adic
def convertDicToString_special(adic,order):
#order is order of keys. Adic=name:num
#special to handle tags...NeuAc*h and NeuAc become NeuAc2*h*
#if negative though, tags won't work, so reject negatives.
astr=''
td = adic.copy()
for o in order: #for each monosaccharide (no labels)
num=0;tags='*'
for s,n in td.items(): #oligosaccharide dictionary (with labels)
if n<0: return '-'
parts=s.split('*') #in case of labels
if o==parts[0]:
if len(parts)>1: tags+=parts[1]*n #catches tag.
num+=n #catches total atoms