From 0625f6bc5a54fef0de6c706b87e61868b30b594d Mon Sep 17 00:00:00 2001
From: rbutleriii
Date: Wed, 28 Mar 2018 18:23:14 -0500
Subject: [PATCH] rsID batching via epost

---
 clinotator/clinotator.py  |   8 +--
 clinotator/getncbi.py     | 134 +++++++++++++++++---------------------
 clinotator/global_vars.py |   6 +-
 clinotator/vcf.py         |   4 +-
 4 files changed, 69 insertions(+), 83 deletions(-)

diff --git a/clinotator/clinotator.py b/clinotator/clinotator.py
index bafd6eb..309495e 100755
--- a/clinotator/clinotator.py
+++ b/clinotator/clinotator.py
@@ -71,13 +71,13 @@ def getargs():
 # choosing log options
 def log_opts(log, long_log, outprefix):
     if log:
-        logging.basicConfig(level=logging.WARN,
+        logging.basicConfig(level=logging.INFO,
                             filename="{}.log".format(outprefix))
     elif long_log:
         logging.basicConfig(level=logging.DEBUG,
                             filename="{}.log".format(outprefix))
     else:
-        logging.basicConfig(level=logging.WARN)
+        logging.basicConfig(level=logging.INFO)
 
 # how to handle file types, returns vcf_tbl or False for output
 def input_selection(file_type, file, outprefix, query_results):
@@ -141,7 +141,7 @@ def output_files(vcf_tbl, variant_objects, outprefix):
                'CVNA', 'CVDS', 'CVLE', 'CTRS', 'CTAA', 'CTPS', 'CTRR']
     result_tbl = pd.DataFrame([{fn: getattr(variant, fn) for fn in columnz}
-                           for variant in variant_objects])
+                               for variant in variant_objects])
     result_tbl = result_tbl[columnz]
     result_tbl['VID'] = result_tbl['VID'].astype(int)
     result_tbl.sort_values(by='VID', inplace=True)
@@ -169,7 +169,7 @@ def main():
     Entrez.email = args.email
 
     for file in args.input:
-        logging.warning('Starting on {}'.format(file))
+        logging.info('Starting on {}'.format(file))
         query_results = []
         variant_objects = []
         base = os.path.basename(file)
diff --git a/clinotator/getncbi.py b/clinotator/getncbi.py
index 6441f69..b53a99b 100644
--- a/clinotator/getncbi.py
+++ b/clinotator/getncbi.py
@@ -28,14 +28,40 @@ def timing_tool():
     except:
         return time.time()
 
+# http attempts loop for biopython; assumes Bio.Entrez is imported as Entrez
+def http_attempts(query_type, **kwargs):
+    for attempt in range(4):  # on HTTPError, retry up to three times
+        try:
+            fetch_handle = getattr(Entrez, query_type)(**kwargs)
+        except ValueError as oops:
+            logging.warning('Empty set: Likely total number = batch size')
+            break
+        except HTTPError as err:
+            if 500 <= err.code <= 599:
+                logging.info("Attempt {} of 4".format(attempt + 1))
+                logging.warning(err)
+                time.sleep(15)
+            elif err.code == 400:
+                logging.info("Attempt {} of 4".format(attempt + 1))
+                logging.warning(err)
+                sys.tracebacklimit = None
+                time.sleep(15)
+            else:
+                raise
+        else:
+            break
+    else:
+        logging.critical('Failed to connect to NCBI 4 times, exiting...')
+        sys.exit()
+    return fetch_handle
+
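The helper above centralizes the four-attempt retry loop that was previously duplicated in batch_ncbi and batch_nohist: it resolves any e-utility by name on Bio.Entrez, sleeps 15 s and retries on HTTP 5xx/400 errors, and exits after four failures. A minimal usage sketch follows; the db, id, and rettype values are illustrative assumptions, not taken from this patch:

    from Bio import Entrez

    Entrez.email = 'you@example.org'  # hypothetical address; NCBI requires one

    # one retry loop serves every e-utility via getattr dispatch
    fetch_handle = http_attempts('efetch', db='clinvar', id='12345',
                                 rettype='variation', retmode='xml')
    xml_blob = fetch_handle.read()
    fetch_handle.close()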
 # epost list to e-utilities history server for target db.
 # Can't elink over 1000, no max on epost
 def post_ncbi(file_type, query_type, **kwargs):
     logging.debug('{} to ncbi using kwargs: {}'.format(query_type, kwargs))
-    handle = getattr(Entrez, query_type)(**kwargs)
-    query = Entrez.read(handle)
-    logging.debug(query)
-    handle.close()
+    fetch_handle = http_attempts(query_type, **kwargs)
+    query = Entrez.read(fetch_handle)
+    fetch_handle.close()
 
     if file_type == 'rsid' and query_type == 'elink':
         webenv = query[0]['WebEnv']
@@ -54,86 +80,46 @@ def batch_ncbi(query_type, query_results, id_list, **kwargs):
 
     for start in range(0, count, g.efetch_batch):
         end = min(count, start + g.efetch_batch)
+        logging.info("Going to download record {} to {}"
+                     .format(start + 1, end))
+
         logging.debug('{} run with {}'
                       .format(query_type, dict(retstart=start, **kwargs)))
-        print("Going to download record {} to {}".format(start + 1, end))
-
-        for attempt in range(4): # HTTPError and retry three times
-            try:
-                fetch_handle = getattr(Entrez, query_type) \
-                    (**dict(retstart=start, **kwargs))
-            except ValueError as oops:
-                logging.warning('Empty set: Likely total number = batch size')
-                break
-            except HTTPError as err:
-                if 500 <= err.code <= 599:
-                    print("Attempt {} of 4".format(attempt + 1))
-                    logging.warning(err)
-                    time.sleep(15)
-                elif err.code == 400:
-                    print("Attempt {} of 4".format(attempt + 1))
-                    logging.warning(err)
-                    sys.tracebacklimit = None
-                    time.sleep(15)
-                else:
-                    raise
-            else:
-                break
-        else:
-            logging.critical('Failed to connect to NCBI 4 times exiting...')
-            sys.exit()
-
+        fetch_handle = http_attempts(query_type,
+                                     **dict(retstart=start, **kwargs))
         # Ideally Entrez.read would import as python object, and validate with
-        # ClinVar, but ClinVar no longer declares DOCTYPE for validation in
-        # their xml returns. They have .xsd for this purpose, but users
-        # downloading their own validation file is silly, plus there is no
-        # option with Entrez.read to specify the .xsd file. Ultimately, the
-        # best option for now is to utilize ElementTree for parsing. >Revisit<
+        # ClinVar, but ClinVar currently doesn't declare a DOCTYPE for
+        # validation in their xml header. They have an .xsd for this purpose,
+        # but Entrez.read can't use it w/o a DOCTYPE declaration. Something of
+        # a security flaw, though hopefully NCBI keeps its xml injection free.
+        # Ultimately, the best option for now is to utilize ElementTree for
+        # parsing. >Revisit<
         # data = Entrez.read(fetch_handle)
-        # print(data)
         # fetch_handle.close()
         query_results.append(fetch_handle.read())
+        fetch_handle.close()
     return
 
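Together, post_ncbi and batch_ncbi implement the epost -> history server -> paged efetch pattern: the id list is posted once, then each batch is pulled with retstart against the stored WebEnv. A hedged sketch of that round trip, assuming getncbi.py's imports (the VIDs and efetch parameters are placeholders; the real call site in get_ncbi_xml is not shown in this hunk):

    vid_list = ['12345', '67890']  # hypothetical ClinVar variation ids
    webenv1, query_key1 = post_ncbi('vid', 'epost', db='clinvar',
                                    id=','.join(vid_list))
    query_results = []
    batch_ncbi('efetch', query_results, vid_list, db='clinvar',
               rettype='variation', retmode='xml', retmax=g.efetch_batch,
               webenv=webenv1, query_key=query_key1)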
 
-# E-utilities batch queries w/o history server, useful for elink.
-# Unreliable over batch size of 1000
-def batch_nohist(query_type, id_list, **kwargs):
+# batch the queries locally, then epost -> e-utility.
+# elink can't handle batch sizes > 1000 and doesn't support retstart/retmax,
+# so this is the only way to epost -> elink, egquery, etc.
+def batch_local(file_type, query_type, id_list, **kwargs):
     count = len(id_list)
     result_list = []
 
     for start in range(0, count, g.elink_batch):
         end = min(count, start + g.elink_batch)
-        logging.debug('{} run with {}'
-                      .format(query_type,
-                              dict(id=",".join(id_list[start:end]), **kwargs)))
-        print("Looking up VIDs for rsIDs {} to {}".format(start + 1, end))
-
-        for attempt in range(4): # HTTPError and retry three times
-            try:
-                fetch_handle = getattr(Entrez, query_type) \
-                    (**dict(id=",".join(id_list[start:end]), **kwargs))
-            except ValueError as oops:
-                logging.warning('Empty set: Likely total number = batch size')
-                break
-            except HTTPError as err:
-                if 500 <= err.code <= 599:
-                    print("Attempt {} of 4".format(attempt + 1))
-                    logging.warning(err)
-                    time.sleep(15)
-                elif err.code == 400:
-                    print("Attempt {} of 4".format(attempt + 1))
-                    logging.warning(err)
-                    sys.tracebacklimit = None
-                    time.sleep(15)
-                else:
-                    raise
-            else:
-                break
-        else:
-            logging.critical('Failed to connect to NCBI 4 times, exiting...')
-            sys.exit()
-
+        logging.info("Looking up VIDs for rsIDs {} to {}"
+                     .format(start + 1, end))
+        webenv1, query_key1 = post_ncbi(
+            file_type, 'epost', db=kwargs['dbfrom'],
+            id=",".join(id_list[start:end]))
+        logging.debug('{} run with {}'.format(query_type, dict(
+            webenv=webenv1, query_key=query_key1, **kwargs)))
+        fetch_handle = http_attempts(query_type, **dict(
+            webenv=webenv1, query_key=query_key1, **kwargs))
         record = Entrez.read(fetch_handle)
+        fetch_handle.close()
         result_list.extend(
             [link['Id'] for link in record[0]['LinkSetDb'][0]['Link']])
     return result_list
@@ -147,13 +133,13 @@ def get_ncbi_xml(file_type, id_list, query_results):
 
     if file_type == 'rsid':
         id_list = list(filter(None, id_list))  # costs ~ 6 sec per 50k list
-        vid_list = batch_nohist('elink', id_list, db='clinvar',
+        vid_list = batch_local(file_type, 'elink', id_list, db='clinvar',
                                dbfrom='snp', linkname='snp_clinvar')
     elif file_type == 'vid':
         vid_list = list(filter(None, id_list))  # costs ~ 6 sec per 50k list
     else:
-        logging.critical('Error: Incorrect file_type argument in get_rsid_xml ->'
-                         ' {}'.format(file_type))
+        logging.critical('Error: Incorrect file_type argument in get_rsid_xml'
+                         ' -> {}'.format(file_type))
 
     webenv1, query_key1 = post_ncbi(file_type, 'epost', db='clinvar',
                                     id=",".join(vid_list))
@@ -163,5 +149,5 @@ def get_ncbi_xml(file_type, id_list, query_results):
     [query_results.append(batch) for batch in staging_list]
     stop = timing_tool()
     logging.info('Download time: {} min, Batches run -> {}'
-             .format(((stop-start)/60), len(query_results)))
+                 .format(((stop-start)/60), len(query_results)))
     return
diff --git a/clinotator/global_vars.py b/clinotator/global_vars.py
index 9d6fd50..43f4b95 100644
--- a/clinotator/global_vars.py
+++ b/clinotator/global_vars.py
@@ -10,14 +10,14 @@ See main, eventually tests will be added for this module
 '''
 
-__version__ = "0.3.0"
+__version__ = "0.4.0"
 
 ### getncbi.py global variables
 
 # batch size for querying NCBI, not above 5000
-elink_batch = 1000
-efetch_batch = 4500
+elink_batch = 1000  # modify for rsID to VID lookup (max 1000)
+efetch_batch = 4500  # modify for Variation record download size (max 10000)
 
 # name of software for ncbi query tagging (preferred by NCBI)
 etool = 'Clinotator'
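batch_local is what get_ncbi_xml now calls for the rsID -> VID step, and the two knobs above (elink_batch, efetch_batch) size its epost chunks and the later efetch pages. A small, hedged example of the lookup, mirroring the call in get_ncbi_xml (the rs numbers are hypothetical and the snippet assumes getncbi.py's namespace):

    # translate dbSNP rsIDs (numeric part only) into ClinVar VIDs,
    # eposting at most g.elink_batch (1000) ids per elink round trip
    rsid_list = ['121913529', '28897696']
    vid_list = batch_local('rsid', 'elink', rsid_list, db='clinvar',
                           dbfrom='snp', linkname='snp_clinvar')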
diff --git a/clinotator/vcf.py b/clinotator/vcf.py
index 8379c58..5fe40a9 100644
--- a/clinotator/vcf.py
+++ b/clinotator/vcf.py
@@ -28,8 +28,8 @@ def parse_header(file_object, outprefix):
     with open('{}.anno.vcf'.format(outprefix), 'w') as outfile:
         header = []
         info_list = []
-        for index, line in enumerate(next(file_object)
-                                     for x in range(g.max_vcf_header_size)):
+        for index, line in enumerate(
+                next(file_object) for x in range(g.max_vcf_header_size)):
             m = re.match('##(\w+)=', line)
             if m and m.group(1) == 'INFO':
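The reflowed generator expression caps header scanning at g.max_vcf_header_size lines; each line is then tested against the '##' meta-line regex. A standalone illustration with made-up header lines:

    import re

    for line in ['##fileformat=VCFv4.2',
                 '##INFO=<ID=DP,Number=1,Type=Integer,Description="Depth">',
                 '#CHROM\tPOS\tID\tREF\tALT']:
        m = re.match(r'##(\w+)=', line)
        print(m.group(1) if m else 'not a meta line')
    # prints: fileformat, INFO, not a meta line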