forked from Sethupathy-Lab/miRquant
-
Notifications
You must be signed in to change notification settings - Fork 0
/
final_processing.py
147 lines (121 loc) · 4.73 KB
/
final_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/python2

# Help text handed to f_utils.check_for_input together with sys.argv in the
# __main__ block.  The invocation line mirrors the argparse definition there:
# one positional configuration path plus the optional -d/--DESeq switch.
usage='''
Usage:
    python final_processing.py [-d] /path/to/configuration

    where: configuration = path to the miRquant configuration
           -d, --DESeq   = also run DESeq2 on the raw counts file
                           (requires a conditions file)

Description:
    This is a wrapper script for the various scripts generating the outputs for the
    final report.  The scripts that are wrapped here are:

    generate_mapping_info.py - generates the mapping statistics for all samples
    lenDist.py - generates the read length histogram for all samples
    lenDistGraph.R - generates read length histogram image for all samples
    smRNAseq_correlation.R - generates correlation table and image for sample dists
    genNormalRPMMM.py - generates the reads per million miRs mapped for all samples
    genNormalRPMM.py - generates the reads per million mapped for all samples

    For more information on what the various scripts do, please consult the script
    file.
'''
import sys
import os
import glob
import argparse
import datetime
from bin.utils import load_mirquant_config_file
from bin.final_analysis import f_utils, \
generate_mapping_info, \
lenDist, \
generate_normalized_counts, \
generate_normalized_RPMYM_counts, \
statistics, \
assemble_xls, \
autoDESeq
def create_output_folder(outPath):
    '''
    Create a uniquely named output folder for the final results.

    The folder is named <outPath>/<year>_<month>_<day>_miRquant_<n>/ using
    today's date, where <n> starts at 1 and is incremented until a folder
    name is found that does not exist yet.

    Args:
        outPath: directory under which the results folder is created
                 (missing parent directories are created as well).

    Returns:
        Path of the newly created folder, including the trailing slash.

    Raises:
        OSError: if the folder cannot be created for any reason other than
                 it already existing (e.g. permission denied).
    '''
    import errno  # local import: only this function needs it

    now = datetime.datetime.now()
    c = 1
    while True:
        d = '{}/{}_{}_{}_miRquant_{}/'.format(outPath, now.year, now.month, now.day, c)
        try:
            os.makedirs(d)
        except OSError as e:
            # Only a name clash means "try the next counter".  Any other
            # failure would repeat for every counter value and make this
            # loop spin forever, so let it propagate.
            if e.errno != errno.EEXIST:
                raise
            c += 1
        else:
            return d
def sample_input_location(basePath, outPath):
    '''
    Get the per-sample input locations for the final output scripts.

    Every entry directly under basePath whose name ends with '.' is treated
    as a sample; the trailing '.' is dropped to obtain the sample name.

    Args:
        basePath: directory searched for entries matching '*.'.
        outPath: directory holding one sub-folder per sample name.  It may
                 be given with or without a trailing slash.

    Returns:
        List of '<outPath>/<sample>/output/' paths (each with a trailing
        separator), sorted by sample name so the order does not depend on
        the file system's listing order.
    '''
    names = sorted(os.path.basename(p)[:-1]
                   for p in glob.glob('{}/*.'.format(basePath)))
    # os.path.join inserts the separator only when outPath lacks one; the
    # empty last component yields the trailing slash.
    return [os.path.join(outPath, n, 'output', '') for n in names]
def mapping_stats(basePath, outPath, samples):
    '''
    Generate the mapping statistics for all the samples.

    Thin wrapper that forwards its three arguments unchanged to
    generate_mapping_info.main and reports progress on stdout before and
    after the call.
    '''
    # A parenthesised single string prints the same text under the
    # Python 2 print statement as under the print function.
    print("Calculating mapping statistics... ")
    generate_mapping_info.main(basePath, outPath, samples)
    print("DONE!\n")
def length_distribution(outPath, samples):
    '''
    Run the length distribution step on all the samples.

    Thin wrapper that forwards outPath and samples to lenDist.main and
    reports progress on stdout before and after the call.
    '''
    # A parenthesised single string prints the same text under the
    # Python 2 print statement as under the print function.
    print("Running length distribution script...")
    lenDist.main(outPath, samples)
    print("DONE!\n")
def RPMMandRPMMM(species, base_path, outPath, samples):
    '''
    Generate the normalized counts tables for all the samples.

    generate_normalized_counts.main is always run.  When species is 'hsa',
    generate_normalized_RPMYM_counts.main is run as well, with 'RPMYM' and
    100 as its additional arguments.

    Note the argument order expected by both callees: outPath comes before
    base_path.
    '''
    print("Generating normalized counts tables...")
    generate_normalized_counts.main(species, outPath, base_path, samples)

    wants_rpmym = (species == 'hsa')
    if wants_rpmym:
        generate_normalized_RPMYM_counts.main(
            species, outPath, base_path, 'RPMYM', 100, samples)

    print("DONE!\n")
def calculate_statistics(basePath, outPath):
    '''
    Run the statistics step when a conditions file is available.

    Does nothing unless <basePath>/conditions.csv exists.  Otherwise
    conditions.csv (and comparisons.csv, when present) is copied into
    outPath and statistics.main is called with
    <outPath>/RPMMM_mirs_over_50.csv, the copied conditions and comparisons
    paths, and outPath.

    Args:
        basePath: directory that may contain conditions.csv and
                  comparisons.csv.
        outPath: existing results directory the files are copied into.

    Raises:
        IOError / OSError: if a file that exists cannot be copied.
    '''
    import shutil  # local import: only this function needs it

    cond_src = '{}/conditions.csv'.format(basePath)
    if not os.path.exists(cond_src):
        return

    print("Calculating statistics...")
    cond = '{}/conditions.csv'.format(outPath)
    comp = '{}/comparisons.csv'.format(outPath)

    # Copy without going through a shell so that paths containing spaces or
    # shell metacharacters work, and so that a failed copy is not silent.
    shutil.copy(cond_src, cond)

    # comparisons.csv is optional: the previous `cp` call simply failed
    # when it was missing and processing went on, so keep that behaviour.
    comp_src = '{}/comparisons.csv'.format(basePath)
    if os.path.exists(comp_src):
        shutil.copy(comp_src, comp)

    RPMMM = '{}/RPMMM_mirs_over_50.csv'.format(outPath)
    statistics.main(RPMMM, cond, comp, outPath)
    print("DONE!\n")
def DESeq(basePath, outPath, D):
    '''
    Run DESeq2 on the raw counts file when requested and possible.

    autoDESeq.runDESeq(outPath) is only called when
    <basePath>/conditions.csv exists and D (the DESeq flag) is truthy;
    otherwise the function returns without doing anything.
    '''
    conditions_file = '{}/conditions.csv'.format(basePath)
    if not (os.path.exists(conditions_file) and D):
        return

    print("Running DESeq...")
    autoDESeq.runDESeq(outPath)
    print("DONE!\n")
def main(arg):
    '''
    Run every final-processing step in order.

    Args:
        arg: parsed command-line arguments.  arg.conf is handed to
             load_mirquant_config_file and arg.DESeq toggles the DESeq step.

    The loaded configuration supplies cfg['paths']['output'],
    cfg['paths']['project'] and cfg['parameters']['species'].
    '''
    cfg = load_mirquant_config_file(arg.conf)
    paths = cfg['paths']

    # Fresh, uniquely named results folder below the configured output path.
    outPath = create_output_folder(paths['output'])

    project = paths['project']
    # Sample inputs live under the configured output path itself, not under
    # the results folder created above.
    samples = sample_input_location(project, paths['output'])

    mapping_stats(project, outPath, samples)
    length_distribution(outPath, samples)
    RPMMandRPMMM(cfg['parameters']['species'], project, outPath, samples)
    calculate_statistics(project, outPath)
    DESeq(project, outPath, arg.DESeq)
    assemble_xls.main(outPath)
if __name__ == '__main__':
    # The raw argv and the usage text are checked first, before argparse
    # runs.  NOTE(review): presumably this prints the usage text when no
    # input is given - confirm in f_utils.
    f_utils.check_for_input(sys.argv, usage)

    cli = argparse.ArgumentParser(description='Runs all the final summary scripts')
    # Positional: location of the configuration passed on to main().
    cli.add_argument('conf', action='store', help='Path to configuration directory')
    # Optional switch enabling the DESeq step.
    cli.add_argument('-d', '--DESeq', action='store_true',
                     help='Run DESeq2 on raw counts file (requires conditions file)')

    main(cli.parse_args())