fastq_filter.py
#!/usr/bin/env python
import argparse
from tqdm import tqdm
import os
import sys
from jakomics.fastq import FASTQ
from jakomics import colors
import jak_utils
# stop those pesky future warnings....
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
# OPTIONS #####################################################################

parser = argparse.ArgumentParser(
    description='Adapter-trim, contaminant-filter, and quality-filter fastq files with bbtools')

parser.add_argument('-s', '--samples',
                    help="excel file with samples in S, F, R, I columns",
                    required=True)

parser.add_argument('-a', '--amplicons', '--amplicon',
                    action='store_true',
                    help='Run amplicon workflow instead of shotgun workflow')

parser.add_argument('-q', '--quiet',
                    action='store_false',
                    help='Suppress printing of the commands that are run')

parser.add_argument('-m', '--memory',
                    help="Memory to pass to bbtools",
                    required=False,
                    default="Xmx8g")

parser.add_argument('-t', '--threads',
                    help="Threads to pass to bbtools",
                    required=False,
                    default=8)

parser.add_argument('--verify_pairs',
                    action='store_true',
                    help="Verify read pairs")

parser.add_argument('--out',
                    help="File to write results to",
                    default="fastq_filter_results.txt",
                    required=False)

args = parser.parse_args()
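
# Example invocations (illustrative; the file names are hypothetical):
#   ./fastq_filter.py -s samples.xlsx -t 16 -m Xmx16g --verify_pairs --out filtered_stats.txt
#   ./fastq_filter.py -s samples.xlsx --amplicons    # amplicon workflow: contaminant filtering only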

def format_stats(sample_series, filter_type, stats):
    """Flatten a nested stats dict into the sample's series.

    Each stats[step][stat] value is added under the key
    '<filter_type>_<stat>_<step>'.
    """
    d = {}
    for step in stats:
        for stat in stats[step]:
            d[filter_type + '_' + stat + '_' + step] = stats[step][stat]

    new = pd.Series(name=sample_series.name, data=d)
    sample_series = pd.concat([sample_series, new], ignore_index=False)

    return sample_series
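
# Illustration (the step/stat names below are assumptions; the real keys come
# from the jakomics FASTQ filtering methods): with filter_type 'QF' and
#   stats = {'in': {'reads': 1000}, 'out': {'reads': 950}}
# format_stats() would add 'QF_reads_in' = 1000 and 'QF_reads_out' = 950
# to the sample's series.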

if __name__ == "__main__":
    jak_utils.header()

    files = pd.read_excel(args.samples, index_col=0, engine='openpyxl')
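    # The sheet is indexed by its first column (the sample name); per the
    # --samples help it also carries F, R, and I columns, presumably the
    # per-sample fastq file paths (what each column holds exactly is an
    # assumption here; the jakomics FASTQ class interprets the row).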
    contam_seqs = jak_utils.get_yaml("contams_db")

    try:
        with open(contam_seqs) as f:
            print(f"{colors.bcolors.GREEN}{contam_seqs} found!{colors.bcolors.END}")
    except FileNotFoundError:
        sys.exit(f"{colors.bcolors.RED}ERROR: {contam_seqs} not found!{colors.bcolors.END}")

    pbar = tqdm(total=len(files.index), desc="Filtered", unit=" fastq files")
    df = pd.DataFrame(columns=['ORDER_VERIFIED'])

    for sample, row in files.iterrows():
        d = FASTQ(sample, row)
        d.verify_read_pairs(echo=args.quiet, run=True, verify=args.verify_pairs)
        sample_series = pd.Series(name=d.sample, data={'ORDER_VERIFIED': d.ordered})

        if args.amplicons:
            cf = d.contaminant_filtering(contam_seqs,
                                         echo=args.quiet,
                                         mem=args.memory,
                                         threads=args.threads)
            sample_series = format_stats(sample_series, 'CF', cf)
        else:
            rt = d.adapter_trimming(contam_seqs,
                                    echo=args.quiet,
                                    mem=args.memory,
                                    threads=args.threads)
            sample_series = format_stats(sample_series, 'RT', rt)

            cf = d.contaminant_filtering(contam_seqs,
                                         echo=args.quiet,
                                         mem=args.memory,
                                         threads=args.threads)
            sample_series = format_stats(sample_series, 'CF', cf)

            qf = d.quality_filtering(echo=args.quiet,
                                     mem=args.memory,
                                     threads=args.threads)
            sample_series = format_stats(sample_series, 'QF', qf)

        df = pd.concat([df, sample_series.to_frame().T], ignore_index=False)
        pbar.update(1)

    # write results to file, prefixed with header and argument comment lines
    if os.path.exists(args.out):
        os.remove(args.out)

    with open(args.out, 'a') as f:
        for c in jak_utils.header(r=True):
            print(f'# {c}', file=f)
        for arg in vars(args):
            print(f'# ARG {arg} = {getattr(args, arg)}', file=f)
        df.to_csv(f, sep="\t", index=True, index_label="SAMPLE")
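
# The results file is a tab-separated table preceded by '#' comment lines.
# Roughly (the stat columns depend on what the jakomics filtering methods
# report and are illustrative here):
#   # <jak_utils header lines>
#   # ARG samples = samples.xlsx
#   # ...
#   SAMPLE    ORDER_VERIFIED    RT_reads_in    CF_reads_in    QF_reads_in    ...
#   sampleA   True              ...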