Skip to content

Commit

Permalink
Merge pull request #3 from rbutleriii/batch_fix_2
Browse files Browse the repository at this point in the history
rsID batching via epost
  • Loading branch information
Robert Butler authored Mar 28, 2018
2 parents 14b942b + 0625f6b commit f8e9e43
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 83 deletions.
8 changes: 4 additions & 4 deletions clinotator/clinotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,13 +71,13 @@ def getargs():
# choosing log options
def log_opts(log, long_log, outprefix):
if log:
logging.basicConfig(level=logging.WARN,
logging.basicConfig(level=logging.INFO,
filename="{}.log".format(outprefix))
elif long_log:
logging.basicConfig(level=logging.DEBUG,
filename="{}.log".format(outprefix))
else:
logging.basicConfig(level=logging.WARN)
logging.basicConfig(level=logging.INFO)

# how to handle file types, returns vcf_tbl or False for output
def input_selection(file_type, file, outprefix, query_results):
Expand Down Expand Up @@ -141,7 +141,7 @@ def output_files(vcf_tbl, variant_objects, outprefix):
'CVNA', 'CVDS', 'CVLE', 'CTRS', 'CTAA', 'CTPS', 'CTRR']

result_tbl = pd.DataFrame([{fn: getattr(variant, fn) for fn in columnz}
for variant in variant_objects])
for variant in variant_objects])
result_tbl = result_tbl[columnz]
result_tbl['VID'] = result_tbl['VID'].astype(int)
result_tbl.sort_values(by='VID', inplace=True)
Expand Down Expand Up @@ -169,7 +169,7 @@ def main():
Entrez.email = args.email

for file in args.input:
logging.warning('Starting on {}'.format(file))
logging.info('Starting on {}'.format(file))
query_results = []
variant_objects = []
base = os.path.basename(file)
Expand Down
134 changes: 60 additions & 74 deletions clinotator/getncbi.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,40 @@ def timing_tool():
except:
return time.time()

# http attempts loop for biopython, assumes Bio.Entrez as Entrez
def http_attempts(query_type, **kwargs):
for attempt in range(4): # HTTPError and retry three times
try:
fetch_handle = getattr(Entrez, query_type)(**kwargs)
except ValueError as oops:
logging.warning('Empty set: Likely total number = batch size')
break
except HTTPError as err:
if 500 <= err.code <= 599:
logging.info("Attempt {} of 4".format(attempt + 1))
logging.warning(err)
time.sleep(15)
elif err.code == 400:
logging.info("Attempt {} of 4".format(attempt + 1))
logging.warning(err)
sys.tracebacklimit = None
time.sleep(15)
else:
raise
else:
break
else:
logging.critical('Failed to connect to NCBI 4 times exiting...')
sys.exit()
return fetch_handle

# epost list to e-utilities history server for target db.
# Can't elink over 1000, no max on epost
def post_ncbi(file_type, query_type, **kwargs):
logging.debug('{} to ncbi using kwargs: {}'.format(query_type, kwargs))
handle = getattr(Entrez, query_type)(**kwargs)
query = Entrez.read(handle)
logging.debug(query)
handle.close()
fetch_handle = http_attempts(query_type, **kwargs)
query = Entrez.read(fetch_handle)
fetch_handle.close()

if file_type == 'rsid' and query_type == 'elink':
webenv = query[0]['WebEnv']
Expand All @@ -54,86 +80,46 @@ def batch_ncbi(query_type, query_results, id_list, **kwargs):

for start in range(0, count, g.efetch_batch):
end = min(count, start + g.efetch_batch)
logging.info("Going to download record {} to {}"
.format(start + 1, end))

logging.debug('{} run with {}'
.format(query_type, dict(retstart=start, **kwargs)))
print("Going to download record {} to {}".format(start + 1, end))

for attempt in range(4): # HTTPError and retry three times
try:
fetch_handle = getattr(Entrez, query_type) \
(**dict(retstart=start, **kwargs))
except ValueError as oops:
logging.warning('Empty set: Likely total number = batch size')
break
except HTTPError as err:
if 500 <= err.code <= 599:
print("Attempt {} of 4".format(attempt + 1))
logging.warning(err)
time.sleep(15)
elif err.code == 400:
print("Attempt {} of 4".format(attempt + 1))
logging.warning(err)
sys.tracebacklimit = None
time.sleep(15)
else:
raise
else:
break
else:
logging.critical('Failed to connect to NCBI 4 times exiting...')
sys.exit()

fetch_handle = http_attempts(query_type,
**dict(retstart=start, **kwargs))
# Ideally Entrez.read would import as python object, and validate with
# ClinVar, but ClinVar no longer declares DOCTYPE for validation in
# their xml returns. They have .xsd for this purpose, but users
# downloading their own validation file is silly, plus there is no
# option with Entrez.read to specify the .xsd file. Ultimately, the
# best option for now is to utilize ElementTree for parsing. >Revisit<
# ClinVar, but ClinVar currently deoesn't declare DOCTYPE for
# validation in their xml header. They have .xsd for this purpose, but
# Entrez.read can't use this w/o a declaration to validate. Something
# of a security flaw, but hopefully NCBI is in control of their xml
# and it is injection free (hopefully...). Ultimately, the best option
# for now is to utilize ElementTree for parsing. >Revisit<
# data = Entrez.read(fetch_handle)
# print(data)
# fetch_handle.close()
query_results.append(fetch_handle.read())
fetch_handle.close()
return

# E-utilities batch queries w/o history server, useful for elink.
# Unreliable over batch size of 1000
def batch_nohist(query_type, id_list, **kwargs):
# batch the queries locally, then epost->etuility.
# elink can't handle batch size > 1000, and doesn't support retstart,retmax.
# this is then the only way to epost -> elink,egquery,etc.
def batch_local(file_type, query_type, id_list, **kwargs):
count = len(id_list)
result_list = []

for start in range(0, count, g.elink_batch):
end = min(count, start + g.elink_batch)
logging.debug('{} run with {}'
.format(query_type,
dict(id=",".join(id_list[start:end]), **kwargs)))
print("Looking up VIDs for rsIDs {} to {}".format(start + 1, end))

for attempt in range(4): # HTTPError and retry three times
try:
fetch_handle = getattr(Entrez, query_type) \
(**dict(id=",".join(id_list[start:end]), **kwargs))
except ValueError as oops:
logging.warning('Empty set: Likely total number = batch size')
break
except HTTPError as err:
if 500 <= err.code <= 599:
print("Attempt {} of 4".format(attempt + 1))
logging.warning(err)
time.sleep(15)
elif err.code == 400:
print("Attempt {} of 4".format(attempt + 1))
logging.warning(err)
sys.tracebacklimit = None
time.sleep(15)
else:
raise
else:
break
else:
logging.critical('Failed to connect to NCBI 4 times, exiting...')
sys.exit()

logging.info("Looking up VIDs for rsIDs {} to {}"
.format(start + 1, end))
webenv1, query_key1 = post_ncbi(
file_type, 'epost', db=kwargs['dbfrom'],
id=",".join(id_list[start:end]))
logging.debug('{} run with {}'.format(query_type, dict(
webenv=webenv1, query_key=query_key1, **kwargs)))
fetch_handle = http_attempts(query_type, **dict(
webenv=webenv1, query_key=query_key1, **kwargs))
record = Entrez.read(fetch_handle)
fetch_handle.close()
result_list.extend(
[link['Id'] for link in record[0]['LinkSetDb'][0]['Link']])
return result_list
Expand All @@ -147,13 +133,13 @@ def get_ncbi_xml(file_type, id_list, query_results):

if file_type == 'rsid':
id_list = list(filter(None, id_list)) # costs ~ 6 sec per 50k list
vid_list = batch_nohist('elink', id_list, db='clinvar',
vid_list = batch_local(file_type, 'elink', id_list, db='clinvar',
dbfrom='snp', linkname='snp_clinvar')
elif file_type == 'vid':
vid_list = list(filter(None, id_list)) # costs ~ 6 sec per 50k list
else:
logging.critical('Error: Incorrect file_type argument in get_rsid_xml ->'
' {}'.format(file_type))
logging.critical('Error: Incorrect file_type argument in get_rsid_xml'
' -> {}'.format(file_type))

webenv1, query_key1 = post_ncbi(file_type, 'epost', db='clinvar',
id=",".join(vid_list))
Expand All @@ -163,5 +149,5 @@ def get_ncbi_xml(file_type, id_list, query_results):
[query_results.append(batch) for batch in staging_list]
stop = timing_tool()
logging.info('Download time: {} min, Batches run -> {}'
.format(((stop-start)/60), len(query_results)))
.format(((stop-start)/60), len(query_results)))
return
6 changes: 3 additions & 3 deletions clinotator/global_vars.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@
See main, eventually tests will be added for this module
'''

__version__ = "0.3.0"
__version__ = "0.4.0"


### getncbi.py global variables

# batch size for querying NCBI, not above 5000
elink_batch = 1000
efetch_batch = 4500
elink_batch = 1000 # modify for rsID to VID lookup (max 1000)
efetch_batch = 4500 # modify for Variation record download size (max 10000)

# name of software for ncbi query tagging (preferred by NCBI)
etool = 'Clinotator'
Expand Down
4 changes: 2 additions & 2 deletions clinotator/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ def parse_header(file_object, outprefix):
with open('{}.anno.vcf'.format(outprefix), 'w') as outfile:
header = []
info_list = []
for index, line in enumerate(next(file_object)
for x in range(g.max_vcf_header_size)):
for index, line in enumerate(
next(file_object) for x in range(g.max_vcf_header_size)):
m = re.match('##(\w+)=', line)

if m and m.group(1) == 'INFO':
Expand Down

0 comments on commit f8e9e43

Please sign in to comment.