-
Notifications
You must be signed in to change notification settings - Fork 1
/
1_fastq_trim.py
110 lines (90 loc) · 3.35 KB
/
1_fastq_trim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import argparse
import json
################################### FUNCTIONS #############################################
def mkdir(dir):
"""
Function to create a directory
"""
import os
if not os.path.exists(dir):
os.mkdir(dir)
def list_dir_files(dir,pattern = "None"):
"""
Function to list the files of a directory
"""
import glob
if pattern == "None":
files = glob.glob(f"{dir}/*")
else:
files = glob.glob(f"{dir}/*{pattern}*")
return files
def get_sample_name(file_names):
"""
Function to list the sample names of a list of fastq files
"""
import os
return(list(set([os.path.basename(file).split("_R1_")[0] for file in file_names if "_R1_" in file])))
def eval_fastqc_file(args):
"""
Function to run fastqc
"""
sample_dict,output,threads,run = args
import subprocess
if run == "1":
subprocess.run(f"fastqc {sample_dict['R1']} -o {output} -t {threads}",shell=True)
subprocess.run(f"fastqc {sample_dict['R2']} -o {output} -t {threads}",shell=True)
def eval_fastq_files(sample_dict,output,run):
"""
Function to run fastqc in multiple threads
"""
import multiprocessing
with multiprocessing.Pool(len(sample_dict)) as pool:
pool.map(eval_fastqc_file,[(sample_dict[sample_name],output,8,run) for sample_name in sample_dict])
def run_trimming(args):
"""
Function to run cutadapt
"""
import subprocess
sample_name, sample_dict, num_threads, run = args
fin1 = sample_dict["R1"]
fin2 = sample_dict["R2"]
fout1 = f"02_trim/{sample_name}_R1.fastq.gz"
fout2 = f"02_trim/{sample_name}_R2.fastq.gz"
if run == "1":
subprocess.run(f"cutadapt --quiet -j {num_threads} -u 1 -U 1 -m 20:20 -q 20 -o {fout1} -p {fout2} {fin1} {fin2}",shell=True)
return({sample_name:{"R1":fout1, "R2":fout2}})
def trimming_files(sample_dict,adapter,run):
"""
Function to run cutadapt in multiple threads
"""
import multiprocessing
import collections
with multiprocessing.Pool(len(sample_dict)) as pool:
sample_dict = pool.map(run_trimming,[(sample_name,sample_dict[sample_name],adapter,8,run) for sample_name in sample_dict])
sample_dict = dict(collections.ChainMap(*sample_dict))
return(sample_dict)
###################################################################################################################
parser = argparse.ArgumentParser()
parser.add_argument("-I", "--input-dir")
parser.add_argument("-L", "--run")
args = vars(parser.parse_args())
input_dir = args["input_dir"]
adapter = args["adapter"]
run = args["run"]
filenames = list_dir_files(input_dir,"fastq.gz")
sample_names = get_sample_name(filenames)
sample_dict = {}
for sample_name in sample_names:
fastq_file_r1 = [x for x in filenames if sample_name in x and "_R1_" in x][0]
sample_dict[sample_name]["R1"] = fastq_file_r1
fastq_file_r2 = [x for x in filenames if sample_name in x and "_R2_" in x][0]
sample_dict[sample_name]["R2"] = fastq_file_r2
mkdir("FastQC")
mkdir("FastQC/Raw")
mkdir("FastQC/Trim")
mkdir("02_trim")
eval_fastq_files(sample_dict,"FastQC/Raw",run)
sample_dict = trimming_files(sample_dict,run)
eval_fastq_files(sample_dict,"FastQC/Trim",run)
with open("00_log/1_2_fastq.json","w") as jsonfile:
json.dump(sample_dict,jsonfile,indent=4)