Skip to content

Commit

Permalink
pinnacle embeddings and keys... unittest fails
Browse files Browse the repository at this point in the history
  • Loading branch information
amva13 committed Jul 27, 2024
1 parent 7ce38a3 commit cd9abfa
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 2 deletions.
4 changes: 3 additions & 1 deletion tdc/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -925,6 +925,7 @@ def get_task2category():
"cell_tissue_mg_edgelist": "txt",
"pinnacle_global_ppi_edgelist": "txt",
"pinnacle_protein_embed": "pth",
"pinnacle_labels_dict": "txt",
}

name2id = {
Expand Down Expand Up @@ -1099,7 +1100,8 @@ def get_task2category():
"tchard_pep_cdr3b_only_sampled_negs_train-4": 10228326,
"cell_tissue_mg_edgelist": 10407107,
"pinnacle_global_ppi_edgelist": 10407108,
"pinnacle_protein_embed": 10407128
"pinnacle_protein_embed": 10407128,
"pinnacle_labels_dict": 10407642,
}

oracle2type = {
Expand Down
12 changes: 11 additions & 1 deletion tdc/resource/pinnacle.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from ..utils import general_load
from ..utils.load import resource_dataset_load
from ..utils.load import resource_dataset_load, load_json_from_txt_file

import pandas as pd

class PINNACLE:

Expand All @@ -12,6 +14,7 @@ def __init__(self, path="./data"):
self.cell_tissue_mg.columns = ["Tissue", "Cell"]
self.embeds_name = "pinnacle_protein_embed"
self.embeds = resource_dataset_load(self.embeds_name, path, [self.embeds_name])
self.keys = load_json_from_txt_file("pinnacle_labels_dict", path)

def get_ppi(self):
    """Return the object stored in ``self.ppi`` unchanged."""
    return self.ppi
Expand All @@ -21,6 +24,13 @@ def get_mg(self):

def get_embeds(self):
    """Return the object stored in ``self.embeds`` unchanged.

    ``self.embeds`` is populated in ``__init__`` by ``resource_dataset_load``
    for the ``"pinnacle_protein_embed"`` resource.
    """
    return self.embeds

def get_keys(self):
    """Return the de-duplicated (target, cell type) label pairs.

    Pairs up the ``"Cell Type"`` and ``"Name"`` columns of ``self.keys`` and
    discards every pair whose cell type starts with ``"BTO"``, ``"CCI"`` or
    ``"Sanity"``.

    Returns:
        pandas.DataFrame: two columns, ``"target"`` (taken from ``"Name"``)
        and ``"cell type"``, with duplicate rows removed and a fresh
        0..n-1 index.

    NOTE(review): if callers rely on these rows lining up one-to-one with
    the embedding rows, confirm the label file contains no duplicate pairs.
    """
    # str.startswith accepts a tuple, so one call covers all prefixes.
    excluded_prefixes = ("BTO", "CCI", "Sanity")
    pairs = [
        (cell_type, name)
        for cell_type, name in zip(self.keys["Cell Type"], self.keys["Name"])
        if not cell_type.startswith(excluded_prefixes)
    ]
    proteins = pd.DataFrame(
        {
            "target": [name for _, name in pairs],
            "cell type": [cell_type for cell_type, _ in pairs],
        }
    )
    # drop_duplicates() returns a new frame rather than modifying in place;
    # the result must be kept or the call has no effect.
    return proteins.drop_duplicates().reset_index(drop=True)




Expand Down
12 changes: 12 additions & 0 deletions tdc/test/test_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,18 @@ def test_mg_ppi_load(self):
assert isinstance(embeds, DataFrame)
assert len(embeds) > 0, "PINNACLE embeds is empty"

def test_embeddings(self):
    """Check that PINNACLE embeddings and keys load as equally long frames."""
    from tdc.resource.pinnacle import PINNACLE

    resource = PINNACLE()

    # Embeddings must come back as a non-empty DataFrame.
    embeddings = resource.get_embeds()
    assert isinstance(embeddings, DataFrame)
    assert len(embeddings) > 0, "PINNACLE embeds is empty"

    # Keys must come back as a non-empty DataFrame as well.
    labels = resource.get_keys()
    assert isinstance(labels, DataFrame)
    assert len(labels) > 0, "PINNACLE keys is empty"

    # One key row is expected per embedding row.
    n_labels, n_embeddings = len(labels), len(embeddings)
    assert n_labels == n_embeddings, "{} vs {}".format(n_labels, n_embeddings)



# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
17 changes: 17 additions & 0 deletions tdc/utils/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,23 @@ def pd_load(name, path):
"TDC is hosted in Harvard Dataverse and it is currently under maintenance, please check back in a few hours or checkout https://dataverse.harvard.edu/."
)

def _parse_json_like(text):
    """Parse *text* holding either a JSON object or a Python-literal dict."""
    import ast
    import json

    # Well-formed JSON is taken verbatim so apostrophes inside string
    # values are never rewritten.
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass
    # Legacy path: single-quoted dumps are turned into JSON by swapping the
    # quote character.
    try:
        return json.loads(text.replace("'", '"'))
    except json.JSONDecodeError as legacy_error:
        # Last resort for Python literals the quote swap cannot express
        # (e.g. mixed quoting); literal_eval never executes code.
        try:
            return ast.literal_eval(text)
        except (ValueError, SyntaxError):
            # Surface the same exception type the quote-swap path raises.
            raise legacy_error from None


def load_json_from_txt_file(name, path):
    """Download a ``.txt`` resource holding a dict of lists and load it.

    Args:
        name: resource name handed to ``download_wrapper``; the value it
            returns is used as the file stem.
        path: directory in which ``<name>.txt`` is read.

    Returns:
        pandas.DataFrame: one column per key of the parsed mapping. Shorter
        lists are right-padded with ``None`` so every column has the same
        length. An empty mapping yields an empty frame.

    Raises:
        json.JSONDecodeError: if the file content cannot be parsed.
    """
    name = download_wrapper(name, path, [name])
    file_path = os.path.join(path, name + ".txt")
    with open(file_path, "r", encoding="utf-8") as f:
        file_content = _parse_json_like(f.read())
    # default=0 keeps an empty mapping from raising in max().
    longest = max((len(values) for values in file_content.values()), default=0)
    padded = {
        key: values + [None] * (longest - len(values))
        for key, values in file_content.items()
    }
    return pd.DataFrame(padded)


def property_dataset_load(name, path, target, dataset_names):
"""a wrapper to download, process and load single-instance prediction task datasets
Expand Down

0 comments on commit cd9abfa

Please sign in to comment.