forked from cimbusch/TWGBS
-
Notifications
You must be signed in to change notification settings - Fork 1
/
TWGBS_read_pair_reconstruction.py
executable file
·175 lines (153 loc) · 6.94 KB
/
TWGBS_read_pair_reconstruction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#!/usr/bin/python
"""
Code to reconstruct correct R1-R2 reads relation from base ratio:
R1R2_layout = (R1_T_C > R2_T_C) and (R2_A_G > R1_A_G)
R2R1_layout = (R1_T_C < R2_T_C) and (R2_A_G < R1_A_G)
read pairs which don't fall into these categories are ignored
Charles Imbusch ([email protected]), G200
"""
import gzip
import argparse
parser = argparse.ArgumentParser(description='Alignment-free program to reconstruct correct R1-R2 reads relation from T/C and A/G base ratio in TWGBS data.')
parser.add_argument('--R1_in', help='R1 input file', required=True)
parser.add_argument('--R2_in', help='R2 input file', required=True)
parser.add_argument('--input_ascii', help='Set if input fastq files are *not* compressed', action='store_true', required=False, default=False)
parser.add_argument('--R1_out', help='R1 output file', required=True)
parser.add_argument('--R2_out', help='R2 output file', required=True)
parser.add_argument('--R1_unassigned', help='Filename R1 for unassigned read pairs', required=True)
parser.add_argument('--R2_unassigned', help='Filename R2 for unassigned read pairs', required=True)
parser.add_argument('--output_ascii', help='Set if output fastq files should be *not* compressed', action='store_true', required=False, default=False)
parser.add_argument('--log', help='Write basic statistics to this file', required=False, default=False)
parser.add_argument('--debug', help='Debug and write more output to console', action='store_true', required=False, default=False)
args = parser.parse_args()
args_dict = vars(args)
# buffer size when writing out uncompressed fastq files
bufsize = 0
# init some variables
f = None
f2 = None
f_R1_out = None
f_R2_out = None
R1_unassigned = None
R2_unassigned = None
# file handlers for input files
if args_dict['input_ascii'] == False:
f = gzip.open(args_dict['R1_in'], 'r')
f2 = gzip.open(args_dict['R2_in'], 'r')
else:
f = open(args_dict['R1_in'], 'r')
f2 = open(args_dict['R2_in'], 'r')
# file handlers for output files
if args_dict['output_ascii'] == False:
f_R1_out = gzip.open(args_dict['R1_out'], 'wb')
f_R2_out = gzip.open(args_dict['R2_out'], 'wb')
R1_unassigned = gzip.open(args_dict['R1_unassigned'], 'wb')
R2_unassigned = gzip.open(args_dict['R2_unassigned'], 'wb')
else:
f_R1_out = open(args_dict['R1_out'], 'wb', bufsize)
f_R2_out = open(args_dict['R2_out'], 'wb', bufsize)
R1_unassigned = open(args_dict['R1_unassigned'], 'wb', bufsize)
R2_unassigned = open(args_dict['R2_unassigned'], 'wb', bufsize)
if args_dict['log'] == True:
f_log = open(args_dict['log'], 'wb')
# for some statistics
num_R1R2 = 0
num_R2R1 = 0
num_unsure = 0
### count A/T/C/G content in a given sequence
def getBaseContent(DNAsequence):
A = DNAsequence.count("A")
T = DNAsequence.count("T")
C = DNAsequence.count("C")
G = DNAsequence.count("G")
N = DNAsequence.count("N")
return (A, T, C, G, N)
### count base content masking CG/TG content
def getContentR1(DNAsequence):
nCG = DNAsequence.count("CG")
nTG = DNAsequence.count("TG")
total = getBaseContent(DNAsequence)
return (total[0],
total[1] - nTG,
total[2] - nCG,
total[3] - nCG - nTG,
total[4])
### count base content masking GC/GT content
def getContentR2(DNAsequence):
nGC = DNAsequence.count("GC")
nGT = DNAsequence.count("GT")
total = getBaseContent(DNAsequence)
return (total[0],
total[1] - nGT,
total[2] - nGC,
total[3] - nGC - nGT,
total[4])
def fastqEntry(header, seq, spacer, qual):
return "%s\n%s\n%\n%s\n".format(header, seq, spacer, qual)
if args_dict['debug'] == True:
print("R1_A\tR1_T\tR1_C\tR1_G\tR1_N\tR2_A\tR2_T\tR2_C\tR2_G\tR2_N\tR1_header")
print("Processing reads...")
# do it forever...
while True:
header1 = f.readline().strip()
# ...ah only if the first line is not empty we go on
if not header1:
break
sequence1 = f.readline().strip()
spacer1 = f.readline().strip()
qual_scores1 = f.readline().strip()
header2 = f2.readline().strip()
sequence2 = f2.readline().strip()
spacer2 = f2.readline().strip()
qual_scores2 = f2.readline().strip()
assert header1.split(" ")[0] == header2.split(" ")[0], "headers are not the same: %r" % header1 + "\t" + header2
R1_content = getContentR1(sequence1)
R2_content = getContentR2(sequence2)
### expert knowledge approach: R1: T:C,A:G versus R2: T:C,A:G;
R1_T_C = (R1_content[1]+1)/(R1_content[2]+1)
R1_A_G = (R1_content[0]+1)/(R1_content[3]+1)
R2_T_C = (R2_content[1]+1)/(R2_content[2]+1)
R2_A_G = (R2_content[0]+1)/(R2_content[3]+1)
# for debugging
if args_dict['debug'] == True:
print("==")
print("R1:R2\t" + str(R1_content) + "\t" + str(R2_content) + "\tR1:T:C|R1:A:G\t" + str(R1_T_C) + "\t" +str(R1_A_G))
print("R2:R1\t" + str(R2_content) + "\t" + str(R1_content) + "\tR2:T:C|R2:A:G\t" + str(R2_T_C) + "\t" +str(R2_A_G))
R1R2_layout = (R1_T_C > R2_T_C) and (R2_A_G > R1_A_G)
R2R1_layout = (R1_T_C < R2_T_C) and (R2_A_G < R1_A_G)
if R1R2_layout:
R1_output = header1 + "\n" + sequence1 + "\n" + spacer1 + "\n" + qual_scores1 + "\n"
R2_output = header2 + "\n" + sequence2 + "\n" + spacer2 + "\n" + qual_scores2 + "\n"
num_R1R2 += 1
f_R1_out.write(R1_output)
f_R2_out.write(R2_output)
if R2R1_layout:
# swapping sequences but keeping the header the same
R1_output = header1 + "\n" + sequence2 + "\n" + spacer2 + "\n" + qual_scores2 + "\n"
R2_output = header2 + "\n" + sequence1 + "\n" + spacer1 + "\n" + qual_scores1 + "\n"
num_R2R1 += 1
f_R1_out.write(R1_output)
f_R2_out.write(R2_output)
if R1R2_layout==False and R2R1_layout==False:
# write unassigned read pairs in the same order as they came in
R1_output = header1 + "\n" + sequence1 + "\n" + spacer1 + "\n" + qual_scores1 + "\n"
R2_output = header2 + "\n" + sequence2 + "\n" + spacer2 + "\n" + qual_scores2 + "\n"
R1_unassigned.write(R1_output)
R2_unassigned.write(R2_output)
num_unsure += 1
# write basic statistics
reads_total = num_R1R2 + num_R2R1 + num_unsure * 1.0
reads_written = num_R1R2 + num_R2R1
statistics_summary = "number of R1R2 read-pairs:\t" + str(num_R1R2) + ", " + str(num_R1R2/reads_total * 100) + "%\n" + "number of R2R1 read-pairs:\t" + str(num_R2R1) + ", " + str(num_R2R1/reads_total * 100) + "%\n" + "number of unsure read-pairs:\t" + str(num_unsure) + ", " + str(num_unsure/reads_total * 100) + "%\n" + "number of reads-pairs written:\t" + str(reads_written) + ", " + str(reads_written/reads_total * 100) + "%\n" + "number of total reads-pairs:\t" + str(reads_total) + "\n"
if args_dict['log'] == True:
f_log.write(statistics_summary)
print(statistics_summary)
# close file handlers
f.close()
f2.close()
f_R1_out.close()
f_R2_out.close()
if args_dict['log'] == True:
f_log.close()
R1_unassigned.close()
R2_unassigned.close()