Merge pull request #3 from rbutleriii/batch_fix_2

rsID batching via epost
rbutleriii · Mar 28, 2018 · f8e9e43 · f8e9e43
2 parents 14b942b + 0625f6b
commit f8e9e43
Show file tree

Hide file tree

Showing 4 changed files with 69 additions and 83 deletions.
diff --git a/clinotator/clinotator.py b/clinotator/clinotator.py
@@ -71,13 +71,13 @@ def getargs():
 # choosing log options
 def log_opts(log, long_log, outprefix):
     if log:
-        logging.basicConfig(level=logging.WARN,
+        logging.basicConfig(level=logging.INFO,
                             filename="{}.log".format(outprefix))
     elif long_log:
         logging.basicConfig(level=logging.DEBUG,
                             filename="{}.log".format(outprefix))
     else:
-        logging.basicConfig(level=logging.WARN)
+        logging.basicConfig(level=logging.INFO)
 
 # how to handle file types, returns vcf_tbl or False for output 
 def input_selection(file_type, file, outprefix, query_results):
@@ -141,7 +141,7 @@ def output_files(vcf_tbl, variant_objects, outprefix):
                'CVNA', 'CVDS', 'CVLE', 'CTRS', 'CTAA', 'CTPS', 'CTRR']
 
     result_tbl = pd.DataFrame([{fn: getattr(variant, fn) for fn in columnz}
-        for variant in variant_objects])
+            for variant in variant_objects])
     result_tbl = result_tbl[columnz]
     result_tbl['VID'] = result_tbl['VID'].astype(int)
     result_tbl.sort_values(by='VID', inplace=True)
@@ -169,7 +169,7 @@ def main():
     Entrez.email = args.email
 
     for file in args.input:
-        logging.warning('Starting on {}'.format(file))
+        logging.info('Starting on {}'.format(file))
         query_results = []
         variant_objects = []
         base = os.path.basename(file)

diff --git a/clinotator/getncbi.py b/clinotator/getncbi.py
@@ -28,14 +28,40 @@ def timing_tool():
     except:
         return time.time()
 
+# http attempts loop for biopython, assumes Bio.Entrez as Entrez
+def http_attempts(query_type, **kwargs):
+    for attempt in range(4): # HTTPError and retry three times
+        try:
+            fetch_handle = getattr(Entrez, query_type)(**kwargs)
+        except ValueError as oops:
+            logging.warning('Empty set: Likely total number = batch size')
+            break
+        except HTTPError as err:
+            if 500 <= err.code <= 599:
+                logging.info("Attempt {} of 4".format(attempt + 1))
+                logging.warning(err)
+                time.sleep(15)
+            elif err.code == 400:
+                logging.info("Attempt {} of 4".format(attempt + 1))
+                logging.warning(err)
+                sys.tracebacklimit = None
+                time.sleep(15)
+            else:
+                raise
+        else:
+            break
+    else:
+        logging.critical('Failed to connect to NCBI 4 times exiting...')
+        sys.exit()
+    return fetch_handle
+
 # epost list to e-utilities history server for target db.
 # Can't elink over 1000, no max on epost
 def post_ncbi(file_type, query_type, **kwargs):
     logging.debug('{} to ncbi using kwargs: {}'.format(query_type, kwargs))
-    handle = getattr(Entrez, query_type)(**kwargs)
-    query = Entrez.read(handle)
-    logging.debug(query)
-    handle.close()
+    fetch_handle = http_attempts(query_type, **kwargs)
+    query = Entrez.read(fetch_handle)
+    fetch_handle.close()
 
     if file_type == 'rsid' and query_type == 'elink':
         webenv = query[0]['WebEnv']
@@ -54,86 +80,46 @@ def batch_ncbi(query_type, query_results, id_list, **kwargs):
 
     for start in range(0, count, g.efetch_batch):
         end = min(count, start + g.efetch_batch)
+        logging.info("Going to download record {} to {}"
+                     .format(start + 1, end))
+
         logging.debug('{} run with {}'
                       .format(query_type, dict(retstart=start, **kwargs)))
-        print("Going to download record {} to {}".format(start + 1, end))
-
-        for attempt in range(4): # HTTPError and retry three times
-            try:
-                fetch_handle = getattr(Entrez, query_type) \
-                        (**dict(retstart=start, **kwargs))
-            except ValueError as oops:
-                logging.warning('Empty set: Likely total number = batch size')
-                break
-            except HTTPError as err:
-                if 500 <= err.code <= 599:
-                    print("Attempt {} of 4".format(attempt + 1))
-                    logging.warning(err)
-                    time.sleep(15)
-                elif err.code == 400:
-                    print("Attempt {} of 4".format(attempt + 1))
-                    logging.warning(err)
-                    sys.tracebacklimit = None
-                    time.sleep(15)
-                else:
-                    raise
-            else:
-                break
-        else:
-            logging.critical('Failed to connect to NCBI 4 times exiting...')
-            sys.exit()
-
+        fetch_handle = http_attempts(query_type,
+                                     **dict(retstart=start, **kwargs))
         # Ideally Entrez.read would import as python object, and validate with
-        # ClinVar, but ClinVar no longer declares DOCTYPE for validation in
-        # their xml returns. They have .xsd for this purpose, but users
-        # downloading their own validation file is silly, plus there is no
-        # option with Entrez.read to specify the .xsd file. Ultimately, the
-        # best option for now is to utilize ElementTree for parsing. >Revisit<
+        # ClinVar, but ClinVar currently deoesn't declare DOCTYPE for 
+        # validation in their xml header. They have .xsd for this purpose, but
+        # Entrez.read can't use this w/o a declaration to validate. Something
+        # of a security flaw, but hopefully NCBI is in control of their xml
+        # and it is injection free (hopefully...). Ultimately, the best option
+        # for now is to utilize ElementTree for parsing. >Revisit<
         # data = Entrez.read(fetch_handle)
-        # print(data)
         # fetch_handle.close()
         query_results.append(fetch_handle.read())
+        fetch_handle.close()
     return
 
-# E-utilities batch queries w/o history server, useful for elink.
-# Unreliable over batch size of 1000
-def batch_nohist(query_type, id_list, **kwargs):
+# batch the queries locally, then epost->etuility.
+# elink can't handle batch size > 1000, and doesn't support retstart,retmax.
+# this is then the only way to epost -> elink,egquery,etc.
+def batch_local(file_type, query_type, id_list, **kwargs):
     count = len(id_list)
     result_list = []
 
     for start in range(0, count, g.elink_batch):
         end = min(count, start + g.elink_batch)
-        logging.debug('{} run with {}'
-                      .format(query_type,
-                              dict(id=",".join(id_list[start:end]), **kwargs)))
-        print("Looking up VIDs for rsIDs {} to {}".format(start + 1, end))
-
-        for attempt in range(4): # HTTPError and retry three times
-            try:
-                fetch_handle = getattr(Entrez, query_type) \
-                        (**dict(id=",".join(id_list[start:end]), **kwargs))
-            except ValueError as oops:
-                logging.warning('Empty set: Likely total number = batch size')
-                break
-            except HTTPError as err:
-                if 500 <= err.code <= 599:
-                    print("Attempt {} of 4".format(attempt + 1))
-                    logging.warning(err)
-                    time.sleep(15)
-                elif err.code == 400:
-                    print("Attempt {} of 4".format(attempt + 1))
-                    logging.warning(err)
-                    sys.tracebacklimit = None
-                    time.sleep(15)
-                else:
-                    raise
-            else:
-                break
-        else:
-            logging.critical('Failed to connect to NCBI 4 times, exiting...')
-            sys.exit()
-
+        logging.info("Looking up VIDs for rsIDs {} to {}"
+                     .format(start + 1, end))
+        webenv1, query_key1 = post_ncbi(
+                file_type, 'epost', db=kwargs['dbfrom'],
+                id=",".join(id_list[start:end]))
+        logging.debug('{} run with {}'.format(query_type, dict(
+                webenv=webenv1, query_key=query_key1, **kwargs)))
+        fetch_handle = http_attempts(query_type, **dict(
+                webenv=webenv1, query_key=query_key1, **kwargs))
         record = Entrez.read(fetch_handle)
+        fetch_handle.close()
         result_list.extend(
                 [link['Id'] for link in record[0]['LinkSetDb'][0]['Link']])
     return result_list
@@ -147,13 +133,13 @@ def get_ncbi_xml(file_type, id_list, query_results):
 
     if file_type == 'rsid':
         id_list = list(filter(None, id_list)) # costs ~ 6 sec per 50k list
-        vid_list = batch_nohist('elink', id_list, db='clinvar',
+        vid_list = batch_local(file_type, 'elink', id_list, db='clinvar',
                                 dbfrom='snp', linkname='snp_clinvar')
     elif file_type == 'vid':
         vid_list = list(filter(None, id_list)) # costs ~ 6 sec per 50k list
     else:
-        logging.critical('Error: Incorrect file_type argument in get_rsid_xml ->'
-                      ' {}'.format(file_type))
+        logging.critical('Error: Incorrect file_type argument in get_rsid_xml'
+                ' -> {}'.format(file_type))
 
     webenv1, query_key1 = post_ncbi(file_type, 'epost', db='clinvar',
                                     id=",".join(vid_list))
@@ -163,5 +149,5 @@ def get_ncbi_xml(file_type, id_list, query_results):
     [query_results.append(batch) for batch in staging_list] 
     stop = timing_tool()
     logging.info('Download time: {} min, Batches run -> {}'
-                  .format(((stop-start)/60), len(query_results)))
+                 .format(((stop-start)/60), len(query_results)))
     return
diff --git a/clinotator/global_vars.py b/clinotator/global_vars.py
@@ -10,14 +10,14 @@
 See main, eventually tests will be added for this module
 '''
 
-__version__ = "0.3.0"
+__version__ = "0.4.0"
 
 
 ### getncbi.py global variables 
 
 # batch size for querying NCBI, not above 5000
-elink_batch = 1000
-efetch_batch = 4500
+elink_batch = 1000 # modify for rsID to VID lookup (max 1000)
+efetch_batch = 4500 # modify for Variation record download size (max 10000)
 
 # name of software for ncbi query tagging (preferred by NCBI)
 etool = 'Clinotator'

diff --git a/clinotator/vcf.py b/clinotator/vcf.py
@@ -28,8 +28,8 @@ def parse_header(file_object, outprefix):
         with open('{}.anno.vcf'.format(outprefix), 'w') as outfile:
             header = []
             info_list = []
-            for index, line in enumerate(next(file_object)
-                                         for x in range(g.max_vcf_header_size)):
+            for index, line in enumerate(
+                    next(file_object) for x in range(g.max_vcf_header_size)):
                 m = re.match('##(\w+)=', line)
 
                 if m and m.group(1) == 'INFO':