-
Notifications
You must be signed in to change notification settings - Fork 19
/
selectionStats.py
executable file
·165 lines (141 loc) · 5.38 KB
/
selectionStats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#!/usr/bin/env python
import sys
import os
import argparse
import glob
import egglib
# This script reads in a fasta alignment or a directory of alignments
# and a list of outgroup sequences. Alignments in a directory must end in .aln
# (the directory is searched with the pattern *.aln).
# Alignments must also be in-frame coding sequence.
# It outputs population genetic statistics including theta, pi,
# piN, piS, Tajima's D, and an MK test table for each outgroup provided.
# This script requires egglib installed with the Bio++ libraries.
class FullPaths(argparse.Action):
    """argparse action that stores an option value as an absolute path.

    A leading ``~`` is expanded to the user's home directory and the
    result is made absolute before being stored on the namespace.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        expanded = os.path.expanduser(values)
        absolute = os.path.abspath(expanded)
        setattr(namespace, self.dest, absolute)
def listdir_fullpath(d):
    """Return every entry of directory *d*, each joined onto *d*."""
    paths = []
    for entry in os.listdir(d):
        paths.append(os.path.join(d, entry))
    return paths
def is_dir(dirname):
    """Return *dirname* if it is an existing directory.

    Raises argparse.ArgumentTypeError otherwise, which lets the function
    be used as an argparse ``type=`` validator.
    """
    if os.path.isdir(dirname):
        return dirname
    raise argparse.ArgumentTypeError(
        "{0} is not a directory".format(dirname))
def is_file(filename):
    """Return *filename* if it is an existing regular file.

    Raises argparse.ArgumentTypeError otherwise, which lets the function
    be used as an argparse ``type=`` validator.
    """
    if os.path.isfile(filename):
        return filename
    raise argparse.ArgumentTypeError(
        "{0} is not a file".format(filename))
def get_arguments():
    """Build the command-line parser and parse the process arguments.

    Exactly one of -a/--alignment (validated by is_file) or
    -d/--directory (validated by is_dir) must be supplied.
    -f/--frame is a boolean flag and -o/--outgroup accepts one or more
    names.

    Returns the namespace produced by ``parse_args()``.
    """
    cli = argparse.ArgumentParser(
        description="Calculate diversity and selection statistic")

    # The input is either a single alignment or a directory, never both.
    source = cli.add_mutually_exclusive_group(required=True)
    source.add_argument('-a', '--alignment',
                        type=is_file,
                        help='Alignment to calculate statistics')
    source.add_argument('-d', '--directory',
                        type=is_dir,
                        help='Directory of alignments')

    cli.add_argument('-f', '--frame',
                     action='store_true',
                     help='Alignment is in correct reading frame')
    cli.add_argument('-o', '--outgroup',
                     type=str,
                     nargs='+',
                     help='Outgroup(s) for McDonald-Kreitman Test')

    return cli.parse_args()
def replace_stop(sequence):
    """Mask stop codons in a nucleotide sequence with gap characters.

    The sequence is read as consecutive triplets starting at position 0.
    Every triplet spelling TAA, TAG or TGA is replaced by '---'. The
    comparison is case-insensitive, so mixed-case codons such as 'TaG'
    are masked as well. All other triplets, including a trailing chunk
    shorter than three characters, are returned unchanged.

    Arguments:
        sequence: nucleotide string, expected to start in frame.

    Returns:
        A new string of the same length with stop codons masked.
    """
    stop_codons = frozenset(('TAA', 'TAG', 'TGA'))
    codons = (sequence[i:i + 3] for i in range(0, len(sequence), 3))
    return ''.join(
        '---' if codon.upper() in stop_codons else codon
        for codon in codons)
def calc_stats(alignment, outgroup):
    """Compute diversity and selection statistics for one alignment.

    Reads the module-level ``args`` namespace (``args.frame``), so the
    command line must have been parsed before this is called.

    Arguments:
        alignment: path of the alignment file passed to egglib.Align.
        outgroup: list of outgroup sequence names, or None to skip the
            per-outgroup MK statistics.

    Returns:
        A dict with the keys 'theta', 'pi' and 'tajimaD'. When
        ``args.frame`` is set it also holds 'piN', 'piS', 'NS' and 'S'.
        For every outgroup name ``o`` it holds 'MK_' + o and 'NI_' + o.
        An empty dict is returned when ``args.frame`` is set and the
        alignment length is not a multiple of three.

    Exits the program with status 1 if an outgroup cannot be located in
    the alignment.
    """
    # NOTE(review): the source this was rebuilt from had lost its
    # indentation; the outgroup section is assumed to run regardless of
    # --frame. Confirm against the original layout.
    statDict = {}
    a = egglib.Align(alignment)
    for i in range(a.ns()):
        seq = a.sequence(i).upper()
        if args.frame:
            seq = replace_stop(seq)
        a.sequence(i, sequence=seq)

    polyDict = a.polymorphism()
    statDict['theta'] = polyDict['thetaW']
    statDict['pi'] = polyDict['Pi']
    statDict['tajimaD'] = polyDict['D']

    # Use the first sequence for the length so that a single-sequence
    # alignment does not fail on a missing index 1.
    align_len = len(a.sequence(0))

    if args.frame:
        if align_len % 3 != 0:
            print("The following alignment is not in frame:")
            print(alignment)
            return {}
        polyDictBPP = a.polymorphismBPP(dataType=4)
        statDict['piN'] = polyDictBPP['PiNS']
        statDict['piS'] = polyDictBPP['PiS']
        statDict['NS'] = polyDictBPP['NSsites']
        statDict['S'] = polyDictBPP['Ssites']

    if outgroup is not None:
        for o in outgroup:
            temp_a = a.extract(0, align_len - 3)  # remove stop codon
            # Keep only the current outgroup: drop every other one.
            for otherOutgroup in outgroup:
                if otherOutgroup == o:
                    continue
                index = temp_a.find(otherOutgroup, strict=False)
                if index is not None:
                    del temp_a[index]
            out_index = temp_a.find(o, strict=False)
            try:
                if out_index is None:
                    # find() is None-checked above for absent names;
                    # treat that the same as an invalid index.
                    raise IndexError(o)
                # Group label 999 is presumably egglib's outgroup
                # marker -- confirm against the EggLib documentation.
                temp_a.group(out_index, group=999)
            except IndexError:
                sys.stderr.write(
                    "The following outgroup is not present in alignment\n")
                sys.stderr.write("%s (%s)\n" % (o, alignment))
                # Non-zero status so callers can detect the failure.
                sys.exit(1)
            polyDictBPP = temp_a.polymorphismBPP(dataType=4)
            statDict['MK_' + o] = polyDictBPP['MK']
            statDict['NI_' + o] = polyDictBPP['NI']
    return statDict
def write_outfile(alignDict, outgroup):
    """Write the statistics table to 'selectionStats.txt'.

    Reads the module-level ``args`` namespace (``args.frame``) to decide
    whether the PiN/NumN/PiS/NumS columns are written.

    Arguments:
        alignDict: maps an alignment name to its statistics dict. An
            empty dict marks an alignment that was skipped; it is
            reported on stdout and gets no row.
        outgroup: list of outgroup names, or None. Each name adds an
            'MK_<name>' and an 'NI_<name>' column.

    The output is tab-separated with one header line and one line per
    alignment, written to the current working directory.
    """
    # NOTE(review): the source this was rebuilt from had lost its
    # indentation; the outgroup columns are assumed to be written
    # regardless of --frame. Confirm against the original layout.
    # 'with' guarantees the file is closed even if a row raises.
    with open('selectionStats.txt', 'w') as outfile:
        header = ['Alignment', 'Theta', 'Pi', 'TajimasD']
        if args.frame:
            header.extend(['PiN', 'NumN', 'PiS', 'NumS'])
        if outgroup is not None:
            for o in outgroup:
                header.extend(['MK_' + o, 'NI_' + o])
        outfile.write('\t'.join(header) + '\n')

        for name in alignDict:
            s = alignDict[name]
            if not s:
                # Two spaces keep the output of the former
                # Python 2 statement: print a, " is not in frame"
                print("%s  is not in frame" % name)
                continue
            row = [name, s['theta'], s['pi'], s['tajimaD']]
            if args.frame:
                row.extend([s['piN'], s['NS'], s['piS'], s['S']])
            if outgroup is not None:
                for o in outgroup:
                    row.extend([s['MK_' + o], s['NI_' + o]])
            # str() rather than '%s' %: a value may be a tuple.
            outfile.write('\t'.join(str(v) for v in row) + '\n')
# Parse the command line once; calc_stats() and write_outfile() read the
# module-level ``args``.
args = get_arguments()
alignDict = {}
# Check if alignment or directory was given and calculate stats accordingly
if args.alignment is None:
    # os.path.join keeps the pattern inside the directory whether or not
    # the path was given with a trailing separator; plain concatenation
    # turned 'dir' into the pattern 'dir*.aln', which matched nothing.
    for align in glob.glob(os.path.join(args.directory, '*.aln')):
        # Name each alignment by its file name without the extension.
        alignName = os.path.splitext(os.path.basename(align))[0]
        alignDict[alignName] = calc_stats(align, args.outgroup)
else:
    alignName = os.path.splitext(args.alignment)[0]
    alignDict[alignName] = calc_stats(args.alignment, args.outgroup)
write_outfile(alignDict, args.outgroup)