Skip to content

Commit

Permalink
Merge pull request #145 from biothings/refactor-orthoAGR-plugin
Browse files Browse the repository at this point in the history
Refactor Plugin
  • Loading branch information
jal347 authored Apr 5, 2024
2 parents c30b6e9 + 9103c8f commit f6c4507
Show file tree
Hide file tree
Showing 2 changed files with 161 additions and 80 deletions.
2 changes: 0 additions & 2 deletions src/plugins/orthology_agr/manifest.json
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
{
"version": "0.2",
"requires" : ["pandas"],
"__metadata__":{
"url": "https://www.alliancegenome.org/",
"license_url": "https://creativecommons.org/licenses/by/4.0/",
"license": "CC BY 4.0"
},
"dumper" : {
"data_url" : "https://fms.alliancegenome.org/download/ORTHOLOGY-ALLIANCE_COMBINED.tsv.gz",
"uncompress" : true,
"release": "parser:setup_release"
},
"uploader" : {
Expand Down
239 changes: 161 additions & 78 deletions src/plugins/orthology_agr/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@
import os
from biothings_client import get_client
import biothings.utils.dataload as dl
import time
import logging
import json


# NOTE(review): rebinding the name "logging" to a Logger instance shadows the
# logging module in this file; Logger exposes info()/error() so later calls
# still work, but a conventional name such as "logger" would be clearer.
logging = logging.getLogger("orthology_agr")

def setup_release(self):
release="2019-06-23"
Expand All @@ -15,95 +21,172 @@ def setup_release(self):

def convert_score(score):
    """
    Convert an AGR best-score flag to a boolean.

    Used for the ``isbestscore`` and ``isbestrevscore`` columns.

    Args:
        score (str): "Yes" or "No" (any other value yields False).

    Returns:
        bool: True only when *score* is exactly "Yes".
    """
    return score == "Yes"

def get_entrez_id(gene_id, gene_client, bad_queries):
    """
    Resolve *gene_id* to an NCBI Entrez gene ID via mygene.info.

    Args:
        gene_id (str): Source gene ID, possibly carrying a WB:/FB:/MGI: prefix.
        gene_client: Initialized biothings_client gene client.
        bad_queries (list): Collector for IDs whose query returned no hits;
            appended to in place on failure.

    Returns:
        str: The "_id" of the first query hit, or the (possibly
        prefix-rewritten) input ID when the query found nothing.
    """
    # Rewrite source-database prefixes into the form mygene.info expects.
    rewrites = (
        ("WB:", "WormBase:"),
        ("FB:", "FLYBASE:"),
        ("MGI:", "mgi:MGI\\:"),
    )
    for old, new in rewrites:
        if old in gene_id:
            gene_id = gene_id.replace(old, new)

    result = gene_client.query(gene_id, fields='symbol,name')
    hits = result["hits"]
    if hits:
        return hits[0]["_id"]

    # No hit: remember the failed ID and fall back to the rewritten input.
    bad_queries.append(gene_id)
    return gene_id


def load_orthology(data_folder):

def adjust_gene_ids(gene_id_list):
    """
    Rewrite known source-database prefixes on a list of gene IDs.

    Args:
        gene_id_list (list): Gene ID strings to adjust; modified in place.

    Returns:
        list: The same list, with recognized prefixes rewritten.
    """
    # Prefix -> replacement, applied at most once per ID (first match wins).
    adjustments = {
        "WB:": "WormBase:",
        "FB:": "FLYBASE:",
        "XB": "XenBase:XB",
    }

    for i, gene_id in enumerate(gene_id_list):
        for prefix, replacement in adjustments.items():
            if gene_id.startswith(prefix):
                # Rewrite only the leading prefix; str.replace would also
                # rewrite any later occurrence of the prefix inside the ID.
                gene_id_list[i] = replacement + gene_id[len(prefix):]
                break

    return gene_id_list

bad_queries=[] # initialize gene query ids that return None (empty)

# use dataload from biothings util to load tsv file
for i, x in enumerate(list(dl.tabfile_feeder(infile, header=15, sep="\t"))):
# pull out first row for the column header names
if i == 0: cols_names=x, print('[COLS] %s \n'%" ".join(x))
# if it isn't the first row, continue with id record
else:
# initialize data holders
gene_id=get_entrez_id(x[0], gene_client, bad_queries)
ortholog_id=get_entrez_id(x[4], gene_client, bad_queries)
def get_gene_cache(unique_gene_ids, gene_client, batch_size=1000):
    """
    Build a cache mapping source gene IDs to NCBI Entrez gene IDs.

    Queries mygene.info in batches; IDs not found in the batch query get a
    second, individual query before being counted as failures.

    Args:
        unique_gene_ids (set): Unique source gene IDs to resolve.
        gene_client: Initialized biothings_client gene client.
        batch_size (int): Number of IDs per querymany call.

    Returns:
        dict: source gene ID -> Entrez gene ID, for every ID that resolved.
    """
    gene_id_cache = {}
    first_pass_fail_ct = 0
    error_ids = []

    # Normalize source-database prefixes, then de-duplicate again since
    # rewriting may have collapsed distinct inputs.
    unique_gene_ids = set(adjust_gene_ids(list(unique_gene_ids)))

    def _batches(iterable, n=1):
        """Yield successive n-sized slices of *iterable*."""
        if isinstance(iterable, set):
            iterable = list(iterable)
        total = len(iterable)
        for start in range(0, total, n):
            yield iterable[start:min(start + n, total)]

    for batch_ids in _batches(unique_gene_ids, batch_size):
        try:
            results = gene_client.querymany(
                batch_ids,
                scopes=["WormBase", "MGI", "FLYBASE", "Xenbase", "WB"],
                fields='symbol,name,entrezgene',
                verbose=False
            )
            for result in results:
                query_id = result['query']
                if 'notfound' in result:
                    first_pass_fail_ct += 1
                    # Second pass: query the ID individually, asking only
                    # for the Entrez ID.
                    try:
                        query_res = gene_client.query(query_id, fields='entrezgene')
                        if query_res.get('hits'):
                            for hit in query_res['hits']:
                                if 'entrezgene' in hit:
                                    gene_id_cache[query_id] = hit['entrezgene']
                        else:
                            logging.info(f"query ID not found in second individual query: {query_id}")
                            error_ids.append(query_id)
                    except Exception as e:
                        logging.error(f"Error in second pass query for {query_id}: {e}")
                else:
                    # Found on the first pass; record its Entrez ID if present.
                    entrez_id = result.get('entrezgene')
                    if entrez_id:
                        gene_id_cache[query_id] = entrez_id
                    else:
                        logging.info(f"Missing data for found query: {result.get('query', 'Unknown ID')}")
        except Exception as e:
            logging.error(f"Error in batch query: {e}")

    logging.info("Completed building gene ID cache.")
    logging.info(f"First pass fail count: {first_pass_fail_ct}")
    logging.info(f"Final error IDs (complete fail): {len(error_ids)}")

    return gene_id_cache

def load_orthology(data_folder):
    """
    Main data-plugin parser for AGR ortholog data.

    Args:
        data_folder (str): Folder containing
            ORTHOLOGY-ALLIANCE_COMBINED.tsv.gz.

    Returns:
        list: One record per resolved gene:
            {"_id": <entrez id>, "orthologs": [<ortholog dict>, ...]}.
    """
    start_time = time.time()

    infile = os.path.join(data_folder, "ORTHOLOGY-ALLIANCE_COMBINED.tsv.gz")
    logging.info(f"Starting upload for orthology AGR from {infile}")
    assert os.path.exists(infile), "Input file does not exist."

    gene_client = get_client('gene')

    # First pass over the file: collect every subject and ortholog gene ID.
    unique_gene_ids = set()
    for line in dl.tabfile_feeder(infile, header=16, sep="\t"):
        unique_gene_ids.update([line[0], line[4]])

    # Resolve all IDs to Entrez IDs up front with batched queries.
    gene_id_cache = get_gene_cache(unique_gene_ids, gene_client)

    # Second pass: build one record per subject gene.
    records = {}
    for line in dl.tabfile_feeder(infile, header=16, sep="\t"):
        gene_id = gene_id_cache.get(line[0])
        ortholog_id = gene_id_cache.get(line[4])
        # Skip rows where either side failed to resolve to an Entrez ID.
        if gene_id and ortholog_id:
            if gene_id not in records:
                records[gene_id] = {
                    "_id": gene_id,
                    "orthologs": []
                }
            records[gene_id]["orthologs"].append({
                "gene_id": gene_id,
                "ortholog_id": ortholog_id,
                "symbol": line[5],
                # taxid column looks like "NCBITaxon:10090" -- keep the number.
                "taxid": int(float(line[6].split(":")[1])),
                "algorithmsmatch": line[9],
                "outofalgorithms": line[10],
                "isbestscore": convert_score(line[11]),
                "isbestrevscore": convert_score(line[12])
            })

    elapsed_time = time.time() - start_time
    ortholog_ct = sum(len(rec["orthologs"]) for rec in records.values())
    logging.info(f"Completed processing {ortholog_ct} ortholog records for {len(records)} genes in {elapsed_time:.2f} seconds.")
    if records:
        logging.info(f"Sample record: {json.dumps(records[next(iter(records))], indent=2)}")

    return list(records.values())

0 comments on commit f6c4507

Please sign in to comment.