This repository has been archived by the owner on May 31, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fparse.py
126 lines (106 loc) · 3.97 KB
/
fparse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env python3
import fobjects
import re
# Parses the SnpEff-annotated file. Only pulls out the prediction for the first transcript. Returns a list of variant objects with attributes id, loc, gene, gt, sevartype, sepred. (rs, anpred, sig, and dn are None.)
def SEparse(path):
    """Parse a SnpEff-annotated VCF file into a list of variant objects.

    Header lines (starting with "#") and blank lines are skipped. Every
    comma-separated ALT allele of a record yields its own
    ``fobjects.variant``; ids are assigned sequentially, starting at 1,
    across the whole file.

    Only the first pipe-delimited annotation entry of the INFO column is
    read, and that same entry is applied to every ALT allele of the record.

    Args:
        path: Path to the SnpEff-annotated VCF file.

    Returns:
        A list of ``fobjects.variant`` objects, each built with ``id``,
        ``loc`` (columns 1, 2, 4 and the allele, formatted as
        ``CHROM>POS:REF/ALT``), ``gene``, ``gt``, ``sevartype`` and
        ``sepred``.

    Raises:
        IndexError: If a data line has fewer than 10 tab-separated columns
            or its INFO column has fewer than 4 pipe-delimited fields.
    """
    varlist = []
    var_id = 0
    # The context manager closes the file even if a malformed line raises.
    with open(path) as handle:
        for line in handle:
            # Skip headers, and blank lines that would otherwise raise
            # IndexError when the columns are indexed below.
            if line.startswith("#") or not line.strip():
                continue
            cols = line.rstrip().split("\t")
            # These values are the same for every ALT allele of the record,
            # so they are extracted once per line.
            info = cols[7].split("|")
            sevartype = info[1]
            sepred = info[2]
            gene = info[3]
            # The genotype is the first colon-separated field of column 10.
            gt = cols[9].split(":")[0]
            for alt in cols[4].split(","):
                var_id += 1
                loc = cols[0] + ">" + cols[1] + ":" + cols[3] + "/" + alt
                varlist.append(
                    fobjects.variant(
                        id=var_id,
                        loc=loc,
                        gene=gene,
                        gt=gt,
                        sevartype=sevartype,
                        sepred=sepred,
                    )
                )
    return varlist
# Parses the ANNOVAR-annotated file. Only pulls out the prediction from MutationTaster. Returns a list of variant objects with attributes id, loc, gene, gt, rs, anvartype, anpred. (sepred, sig, and dn are None.) If an attribute is missing from the INFO column, "." is passed for it (NOTE: originally documented as None; confirm how fobjects.variant treats ".").
def ANparse(path):
    """Parse an ANNOVAR-annotated VCF file into a list of variant objects.

    Header lines (starting with "#") and blank lines are skipped. Every
    comma-separated ALT allele of a record yields its own
    ``fobjects.variant``; ids are assigned sequentially, starting at 1,
    across the whole file.

    The INFO keys ``Func.refGene``, ``avsnp147``, ``Gene.refGene`` and
    ``MutationTaster_pred`` are looked up in column 8. A key that is absent
    (or has an empty value) is reported as ``"."``.

    Args:
        path: Path to the ANNOVAR-annotated VCF file.

    Returns:
        A list of ``fobjects.variant`` objects, each built with ``id``,
        ``loc`` (formatted as ``CHROM>POS:REF/ALT``), ``gene``, ``gt``,
        ``rs``, ``anvartype`` and ``anpred``.

    Raises:
        IndexError: If a data line has fewer than 10 tab-separated columns.
    """

    def info_value(info, key):
        # Anchor the key at the start of an INFO entry so "Func.refGene"
        # cannot match inside "ExonicFunc.refGene", escape the literal dot,
        # and do not require a trailing ";" so the last entry is found too.
        found = re.search(r"(?:^|;)" + re.escape(key) + r"=([^;]+)", info)
        return found.group(1) if found else "."

    varlist = []
    var_id = 0
    # The context manager closes the file even if a malformed line raises.
    with open(path) as handle:
        for line in handle:
            # Skip headers, and blank lines that would otherwise raise
            # IndexError when the columns are indexed below.
            if line.startswith("#") or not line.strip():
                continue
            cols = line.rstrip().split("\t")
            # The genotype is the first colon-separated field of column 10.
            gt = cols[9].split(":")[0]
            # The annotations do not depend on the ALT allele being emitted,
            # so they are extracted once per line.
            info = cols[7]
            anvartype = info_value(info, "Func.refGene")
            rs = info_value(info, "avsnp147")
            gene = info_value(info, "Gene.refGene")
            anpred = info_value(info, "MutationTaster_pred")
            for alt in cols[4].split(","):
                var_id += 1
                loc = cols[0] + ">" + cols[1] + ":" + cols[3] + "/" + alt
                varlist.append(
                    fobjects.variant(
                        id=var_id,
                        loc=loc,
                        gene=gene,
                        gt=gt,
                        rs=rs,
                        anvartype=anvartype,
                        anpred=anpred,
                    )
                )
    return varlist
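# Parses the ClinVar-annotated file. Returns a list of variant objects with attributes id, loc, clinvartype, sig, dn. (gene, gt, sepred, and anpred are None.) If an attribute is missing from the INFO column, "." is passed for it (NOTE: originally documented as None; confirm how fobjects.variant treats ".").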
def CLINparse(path):
    """Parse a ClinVar VCF file into a list of variant objects.

    Header lines (starting with "#") and blank lines are skipped. Each
    remaining line yields exactly one ``fobjects.variant``; the ALT column
    is used as-is and is not split on commas.

    The INFO keys ``CLNVC``, ``CLNSIG`` and ``CLNDN`` are looked up in
    column 8. A key that is absent (or has an empty value) is reported as
    ``"."``.

    Args:
        path: Path to the ClinVar VCF file.

    Returns:
        A list of ``fobjects.variant`` objects, each built with ``id``,
        ``loc`` (formatted as ``CHROM>POS:REF/ALT``), ``clinvartype``,
        ``sig`` and ``dn``.

    Raises:
        IndexError: If a data line has fewer than 8 tab-separated columns.
    """

    def info_value(info, key):
        # Anchor the key at the start of an INFO entry and do not require a
        # trailing ";" so a key that is the last entry is still found.
        found = re.search(r"(?:^|;)" + re.escape(key) + r"=([^;]+)", info)
        return found.group(1) if found else "."

    varlist = []
    var_id = 0
    # The context manager closes the file even if a malformed line raises.
    with open(path) as handle:
        for line in handle:
            # Skip headers, and blank lines that would otherwise raise
            # IndexError when the columns are indexed below.
            if line.startswith("#") or not line.strip():
                continue
            # The id counts data lines in file order, so each variant gets a
            # unique id in the database.
            var_id += 1
            cols = line.rstrip().split("\t")
            loc = cols[0] + ">" + cols[1] + ":" + cols[3] + "/" + cols[4]
            info = cols[7]
            varlist.append(
                fobjects.variant(
                    id=var_id,
                    loc=loc,
                    clinvartype=info_value(info, "CLNVC"),
                    sig=info_value(info, "CLNSIG"),
                    dn=info_value(info, "CLNDN"),
                )
            )
    return varlist
"""
file = "22snpeff.vcf"
variants = SEparse(file)
for i in variants:
print(i.loc)
file2 = "22annovar.vcf"
variants2 = ANparse(file2)
for i in variants2:
i.format_gt(i.gt)
i.format_anpred(i.anpred)
print(i.gt)
print(i.anpred)
file3 = "clinvar.vcf"
variants3 = CLINparse(file3)
for i in variants3:
print(i.loc)
"""