Skip to content

Commit

Permalink
Merge pull request #145 from biothings/refactor-orthoAGR-plugin
Browse files Browse the repository at this point in the history
Refactor Plugin
  • Loading branch information
jal347 authored Apr 5, 2024
2 parents c30b6e9 + 9103c8f commit f6c4507
Show file tree
Hide file tree
Showing 2 changed files with 161 additions and 80 deletions.
2 changes: 0 additions & 2 deletions src/plugins/orthology_agr/manifest.json
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
{
"version": "0.2",
"requires" : ["pandas"],
"__metadata__":{
"url": "https://www.alliancegenome.org/",
"license_url": "https://creativecommons.org/licenses/by/4.0/",
"license": "CC BY 4.0"
},
"dumper" : {
"data_url" : "https://fms.alliancegenome.org/download/ORTHOLOGY-ALLIANCE_COMBINED.tsv.gz",
"uncompress" : true,
"release": "parser:setup_release"
},
"uploader" : {
Expand Down
239 changes: 161 additions & 78 deletions src/plugins/orthology_agr/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@
import os
from biothings_client import get_client
import biothings.utils.dataload as dl
import time
import logging
import json


# NOTE(review): rebinding the name "logging" to a Logger instance shadows the
# logging module in this file; Logger exposes info()/error() so later calls
# still work, but a conventional name such as "logger" would be clearer.
logging = logging.getLogger("orthology_agr")

def setup_release(self):
release="2019-06-23"
Expand All @@ -15,95 +21,172 @@ def setup_release(self):

def convert_score(score):
    """
    Convert an AGR best-score flag to a boolean.

    Used for the ``isbestscore`` and ``isbestrevscore`` columns.

    Args:
        score (str): "Yes" or "No" (any other value yields False).

    Returns:
        bool: True only when *score* is exactly "Yes".
    """
    return score == "Yes"

def get_entrez_id(gene_id, gene_client, bad_queries):
    """
    Resolve *gene_id* to an NCBI Entrez gene ID via mygene.info.

    Args:
        gene_id (str): Source gene ID, possibly carrying a WB:/FB:/MGI: prefix.
        gene_client: Initialized biothings_client gene client.
        bad_queries (list): Collector for IDs whose query returned no hits;
            appended to in place on failure.

    Returns:
        str: The "_id" of the first query hit, or the (possibly
        prefix-rewritten) input ID when the query found nothing.
    """
    # Rewrite source-database prefixes into the form mygene.info expects.
    rewrites = (
        ("WB:", "WormBase:"),
        ("FB:", "FLYBASE:"),
        ("MGI:", "mgi:MGI\\:"),
    )
    for old, new in rewrites:
        if old in gene_id:
            gene_id = gene_id.replace(old, new)

    result = gene_client.query(gene_id, fields='symbol,name')
    hits = result["hits"]
    if hits:
        return hits[0]["_id"]

    # No hit: remember the failed ID and fall back to the rewritten input.
    bad_queries.append(gene_id)
    return gene_id


def load_orthology(data_folder):

def adjust_gene_ids(gene_id_list):
    """
    Rewrite known source-database prefixes on a list of gene IDs.

    Args:
        gene_id_list (list): Gene ID strings to adjust; modified in place.

    Returns:
        list: The same list, with recognized prefixes rewritten.
    """
    # Prefix -> replacement, applied at most once per ID (first match wins).
    adjustments = {
        "WB:": "WormBase:",
        "FB:": "FLYBASE:",
        "XB": "XenBase:XB",
    }

    for i, gene_id in enumerate(gene_id_list):
        for prefix, replacement in adjustments.items():
            if gene_id.startswith(prefix):
                # Rewrite only the leading prefix; str.replace would also
                # rewrite any later occurrence of the prefix inside the ID.
                gene_id_list[i] = replacement + gene_id[len(prefix):]
                break

    return gene_id_list

bad_queries=[] # initialize gene query ids that return None (empty)

# use dataload from biothings util to load tsv file
for i, x in enumerate(list(dl.tabfile_feeder(infile, header=15, sep="\t"))):
# pull out first row for the column header names
if i == 0: cols_names=x, print('[COLS] %s \n'%" ".join(x))
# if it isn't the first row, continue with id record
else:
# initialize data holders
gene_id=get_entrez_id(x[0], gene_client, bad_queries)
ortholog_id=get_entrez_id(x[4], gene_client, bad_queries)
def get_gene_cache(unique_gene_ids, gene_client, batch_size=1000):
    """
    Build a cache mapping source gene IDs to NCBI Entrez gene IDs.

    Queries mygene.info in batches; IDs not found in the batch query get a
    second, individual query before being counted as failures.

    Args:
        unique_gene_ids (set): Unique source gene IDs to resolve.
        gene_client: Initialized biothings_client gene client.
        batch_size (int): Number of IDs per querymany call.

    Returns:
        dict: source gene ID -> Entrez gene ID, for every ID that resolved.
    """
    gene_id_cache = {}
    first_pass_fail_ct = 0
    error_ids = []

    # Normalize source-database prefixes, then de-duplicate again since
    # rewriting may have collapsed distinct inputs.
    unique_gene_ids = set(adjust_gene_ids(list(unique_gene_ids)))

    def _batches(iterable, n=1):
        """Yield successive n-sized slices of *iterable*."""
        if isinstance(iterable, set):
            iterable = list(iterable)
        total = len(iterable)
        for start in range(0, total, n):
            yield iterable[start:min(start + n, total)]

    for batch_ids in _batches(unique_gene_ids, batch_size):
        try:
            results = gene_client.querymany(
                batch_ids,
                scopes=["WormBase", "MGI", "FLYBASE", "Xenbase", "WB"],
                fields='symbol,name,entrezgene',
                verbose=False
            )
            for result in results:
                query_id = result['query']
                if 'notfound' in result:
                    first_pass_fail_ct += 1
                    # Second pass: query the ID individually, asking only
                    # for the Entrez ID.
                    try:
                        query_res = gene_client.query(query_id, fields='entrezgene')
                        if query_res.get('hits'):
                            for hit in query_res['hits']:
                                if 'entrezgene' in hit:
                                    gene_id_cache[query_id] = hit['entrezgene']
                        else:
                            logging.info(f"query ID not found in second individual query: {query_id}")
                            error_ids.append(query_id)
                    except Exception as e:
                        logging.error(f"Error in second pass query for {query_id}: {e}")
                else:
                    # Found on the first pass; record its Entrez ID if present.
                    entrez_id = result.get('entrezgene')
                    if entrez_id:
                        gene_id_cache[query_id] = entrez_id
                    else:
                        logging.info(f"Missing data for found query: {result.get('query', 'Unknown ID')}")
        except Exception as e:
            logging.error(f"Error in batch query: {e}")

    logging.info("Completed building gene ID cache.")
    logging.info(f"First pass fail count: {first_pass_fail_ct}")
    logging.info(f"Final error IDs (complete fail): {len(error_ids)}")

    return gene_id_cache

def load_orthology(data_folder):
    """
    Main data-plugin parser for AGR ortholog data.

    Args:
        data_folder (str): Folder containing
            ORTHOLOGY-ALLIANCE_COMBINED.tsv.gz.

    Returns:
        list: One record per resolved gene:
            {"_id": <entrez id>, "orthologs": [<ortholog dict>, ...]}.
    """
    start_time = time.time()

    infile = os.path.join(data_folder, "ORTHOLOGY-ALLIANCE_COMBINED.tsv.gz")
    logging.info(f"Starting upload for orthology AGR from {infile}")
    assert os.path.exists(infile), "Input file does not exist."

    gene_client = get_client('gene')

    # First pass over the file: collect every subject and ortholog gene ID.
    unique_gene_ids = set()
    for line in dl.tabfile_feeder(infile, header=16, sep="\t"):
        unique_gene_ids.update([line[0], line[4]])

    # Resolve all IDs to Entrez IDs up front with batched queries.
    gene_id_cache = get_gene_cache(unique_gene_ids, gene_client)

    # Second pass: build one record per subject gene.
    records = {}
    for line in dl.tabfile_feeder(infile, header=16, sep="\t"):
        gene_id = gene_id_cache.get(line[0])
        ortholog_id = gene_id_cache.get(line[4])
        # Skip rows where either side failed to resolve to an Entrez ID.
        if gene_id and ortholog_id:
            if gene_id not in records:
                records[gene_id] = {
                    "_id": gene_id,
                    "orthologs": []
                }
            records[gene_id]["orthologs"].append({
                "gene_id": gene_id,
                "ortholog_id": ortholog_id,
                "symbol": line[5],
                # taxid column looks like "NCBITaxon:10090" -- keep the number.
                "taxid": int(float(line[6].split(":")[1])),
                "algorithmsmatch": line[9],
                "outofalgorithms": line[10],
                "isbestscore": convert_score(line[11]),
                "isbestrevscore": convert_score(line[12])
            })

    elapsed_time = time.time() - start_time
    ortholog_ct = sum(len(rec["orthologs"]) for rec in records.values())
    logging.info(f"Completed processing {ortholog_ct} ortholog records for {len(records)} genes in {elapsed_time:.2f} seconds.")
    if records:
        logging.info(f"Sample record: {json.dumps(records[next(iter(records))], indent=2)}")

    return list(records.values())

0 comments on commit f6c4507

Please sign in to comment.