-
Notifications
You must be signed in to change notification settings - Fork 0
/
gff3filter.py
52 lines (34 loc) · 1.34 KB
/
gff3filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/python
# - - - - - H E A D E R - - - - - - - - - - - - - - - - -
# Todd D. Yoder
#
# Purpose: Filter GFF3 file to a single gene of interest
# #Modules/Libraries:--------------------------------------------------------------------------------------------------------------
import argparse
import os
import sys

import pandas as pd
def read_gff3_records(gff_path):
    """Load the feature lines of a GFF3 file into a DataFrame.

    Lines starting with '#' (comments and directives) and blank lines are
    skipped.  Reading stops at a ``##FASTA`` directive, because everything
    after it is sequence data rather than tab-separated features.

    Args:
        gff_path: Path of the GFF3 file to read.

    Returns:
        A DataFrame of strings with one row per feature line and
        integer-labelled columns (0 = seqid, 3 = start, 4 = end,
        8 = attributes).  The frame has no columns if the file holds no
        feature lines.
    """
    records = []
    with open(gff_path) as handle:
        for raw in handle:
            if raw.startswith('##FASTA'):
                break
            if raw.startswith('#'):
                continue
            row = raw.strip('\n')
            if not row.strip():
                continue
            records.append(row.split("\t"))
    return pd.DataFrame(records)


def select_gene_region(gff, gene):
    """Return every feature lying in the region spanned by *gene*.

    The region is defined by the rows whose attributes column (column 9)
    matches *gene*: it starts at the start coordinate of the first matching
    row and ends at the end coordinate of the last matching row on that same
    sequence.  A feature is kept when it is on that sequence and its start
    coordinate falls inside the region, both ends included.

    Args:
        gff: DataFrame as produced by ``read_gff3_records``.
        gene: Pattern searched for in the attributes column.  It is
            interpreted as a regular expression (pandas ``str.contains``
            default), so a plain gene name matches as a substring.

    Returns:
        The matching subset of *gff*, in file order.

    Raises:
        ValueError: If *gff* has fewer than nine columns or no row matches
            *gene*.
    """
    if gff.shape[1] < 9:
        raise ValueError("no 9-column GFF3 feature lines found in input")

    # na=False: rows shorter than nine fields have no attributes to match.
    hits = gff[gff.iloc[:, 8].str.contains(gene, na=False)]
    if hits.empty:
        raise ValueError("no feature matches query '{}'".format(gene))

    chrom = hits.iloc[0, 0]
    start = int(hits.iloc[0, 3])
    # End of the last matching feature on the same sequence as the first hit.
    stop = int(hits[hits.iloc[:, 0] == chrom].iloc[-1, 4])

    on_chrom = gff[gff.iloc[:, 0] == chrom]
    # Coerce so that a malformed row (missing/non-numeric start) is dropped
    # instead of aborting the whole run.
    starts = pd.to_numeric(on_chrom.iloc[:, 3], errors='coerce')
    # between() is inclusive on both ends by default in every pandas
    # version; the old boolean ``inclusive=True`` form fails on pandas 2.x.
    return on_chrom[starts.between(start, stop)]


def main(argv=None):
    """Filter a GFF3 file down to the region of one gene of interest.

    Prints the selected features and writes them, tab-separated and without
    header or index, to ``<query>.gff3`` in the current directory.

    Args:
        argv: Command-line arguments without the program name; ``None``
            means use ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description="Filter GFF3 by gene of interest")
    parser.add_argument('-p', '--path', help='path to GFF3 file...', required=True)
    parser.add_argument('-f', '--file', help='name of GFF3 file...[TSV]', required=True)
    parser.add_argument('-q', '--query', help='gene of interest...', required=True)
    args = vars(parser.parse_args(argv))
    goi = args['query']

    # os.path.join works whether or not --path ends with a separator.
    gff = read_gff3_records(os.path.join(args['path'], args['file']))
    try:
        region = select_gene_region(gff, goi)
    except ValueError as err:
        sys.exit("gff3filter: " + str(err))

    print(region)
    region.to_csv(goi + ".gff3", sep='\t', index=False, header=False)


if __name__ == "__main__":
    main()