-
Notifications
You must be signed in to change notification settings - Fork 11
/
getdata_accessions.py
126 lines (108 loc) · 4.48 KB
/
getdata_accessions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import os
import os.path
from os.path import basename
import subprocess
from subprocess import Popen, PIPE
import glob
# custom Lisa module
import clusterfunc_py3
# 1. Get url for download from list of accessions
def sra_url(accession):
"""
Takes an SRA accession and determines the location of the .sra data file for automated or downloading.
Follows this format: ftp://ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByRun/sra/{SRR|ERR|DRR}/<first 6 characters of accession>/<accession>/<accession>.sra
Format according to NCBI utility handbook: https://www.ncbi.nlm.nih.gov/books/NBK158899/
"""
accession = accession.upper()
return "ftp://ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByRun/sra/{}/{}/{}/{}.sra".format(
accession[0:3], accession[0:6], accession, accession)
def test_sra_url():
assert sra_url('DRR053698') == 'ftp://ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByRun/sra/DRR/DRR053/DRR053698/DRR053698.sra'
# 2. Download data
#(already checked if file exists)
def download(url, newdir, newfile):
filestring = newdir + newfile
if os.path.isfile(filestring):
print("file exists:", filestring)
else:
urlstring = "wget -O " + newdir + newfile + " " + url
print(urlstring)
#s = subprocess.Popen(urlstring, shell=True)
#s.wait()
print("Finished downloading from NCBI.")
# 3. Extract with fastq-dump (sratools)
def sra_extract(newdir, filename):
# if seqtype=="single":
# sra_string="fastq-dump -v "+newdir+file
# print sra_string
# elif seqtype=="paired":
# check whether .fastq exists in directory
if glob.glob(newdir + "*.fastq"):
print("SRA has already been extracted", filename)
else:
sra_string = "fastq-dump -v -O " + newdir + " --split-3 " + newdir + filename
print(sra_string)
print("extracting SRA...")
#s = subprocess.Popen(sra_string, shell=True, stdout=PIPE)
#s.wait()
print("Finished SRA extraction.")
# 4. Generate fastqc from all fastq in directory
def fastqc_report(fastq_file_list, newdir, fastqcdir, filename):
# imports list of files in each directory
print(fastq_file_list)
print(fastqcdir + filename)
if glob.glob(fastqcdir + filename + "_*_fastqc.zip"):
print("fastqc already complete:", filename)
else:
# creates command to generate fastqc reports from all files in list
file_string = str(fastq_file_list)
# print fastq_file_list
file_string = " ".join(fastq_file_list)
# print file_string
fastqc_string = "fastqc -o " + fastqcdir + " " + file_string
print("fastqc reports being generated for: " + str(fastq_file_list))
fastqc_command = [fastqc_string]
process_name = "fastqc"
module_name_list = ""
filename = filename
clusterfunc_py3.qsub_file(fastqcdir, process_name,
module_name_list, filename, fastqc_command)
# this is the main function to execute
def execute(accession, basedir, url):
# Creates directory for each file to be downloaded
# Directory will be located according to organism and read type (single
# or paired)
seq_dir = basedir + accession + "/"
clusterfunc_py3.check_dir(seq_dir)
print(seq_dir)
fastqcdir = seq_dir + "fastqc/"
clusterfunc_py3.check_dir(fastqcdir)
filename = accession + ".sra"
if filename in os.listdir(seq_dir):
print("sra exists:", filename)
if os.stat(seq_dir + filename).st_size == 0:
print("SRA file is empty:", filename)
os.remove(full_filename)
else:
print("file will be downloaded:", filename)
download(url, seq_dir, filename)
sra_extract(seq_dir, filename)
fastqc(seq_dir, fastqcdir, filename)
def fastqc(newdir, fastqcdir, filename):
listoffiles = os.listdir(newdir)
print(listoffiles)
fastq_file_list = []
for i in listoffiles:
if i.endswith(".fastq"):
fastq_file_list.append(newdir + i)
fastqc_report(fastq_file_list, newdir, fastqcdir, filename)
accessions = "DRR053698, DRR082659, ERR489297, DRR030368, DRR031870, DRR046632, DRR069093, ERR058009, ERR1016675, SRR2086412, SRR3499127, SRR1789336, SRR2016923, ERR1674585, DRR036858"
accessions = accessions.replace(" ","").split(",")
print(accessions)
basedir = "/mnt/scratch/ljcohen/oysterriver/"
clusterfunc_py3.check_dir(basedir)
for accession in accessions:
url = sra_url(accession)
execute(accession,basedir,url)
#print url_data
#execute(basedir, url_data)