class ChemblMergerStorage(storage.MergerStorage):
    """Merger storage that combines the ``chembl`` entries of two documents.

    Entries are grouped by their ``uniprot`` value; the ``chembl_target``
    values of entries sharing a ``uniprot`` are gathered into one list.
    """

    @classmethod
    def merge_func(klass, doc1, doc2, **kwargs):
        """Merge the ``chembl`` entries of ``doc1`` and ``doc2`` into a new doc.

        Args:
            doc1: First document. Its ``_id`` is used for the result when
                present. Its ``chembl`` value may be a single entry or a
                list of entries.
            doc2: Second document, same shape as ``doc1``. Supplies the
                ``_id`` when ``doc1`` does not carry one.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A new dict ``{"_id": ..., "chembl": [...]}``. ``chembl`` holds
            one ``{"chembl_target": ..., "uniprot": ...}`` dict per distinct
            ``uniprot``, in first-seen order; any other keys of the input
            entries are not carried over. ``chembl_target`` keeps the value
            it had in the first entry seen for that ``uniprot`` until a
            different target is merged in, at which point it becomes a flat
            list of distinct targets.

        Raises:
            KeyError: If neither document has an ``_id``.
        """
        # NOTE(review): the storage layer calling this may have removed
        # ``_id`` from one of the two documents before merging -- confirm
        # against the MergerStorage implementation. Falling back to doc2
        # keeps the merge working either way.
        doc_id = doc1["_id"] if "_id" in doc1 else doc2["_id"]

        by_uniprot = {}
        for doc in (doc1, doc2):
            entries = doc.get("chembl", [])
            if not isinstance(entries, list):
                entries = [entries]

            for entry in entries:
                uniprot = entry.get("uniprot")
                incoming = entry.get("chembl_target")

                record = by_uniprot.get(uniprot)
                if record is None:
                    # Copy list values so later appends never modify the
                    # caller's documents.
                    if isinstance(incoming, list):
                        incoming = list(incoming)
                    by_uniprot[uniprot] = {
                        "chembl_target": incoming,
                        "uniprot": uniprot,
                    }
                    continue

                current = record["chembl_target"]
                targets = current if isinstance(current, list) else [current]
                # An incoming value may itself be a list (e.g. a document
                # produced by an earlier merge): flatten it instead of
                # nesting, and skip targets that are already recorded.
                additions = incoming if isinstance(incoming, list) else [incoming]
                for target in additions:
                    if target not in targets:
                        targets.append(target)
                # Only switch a scalar to a list once there really is more
                # than one distinct target.
                if len(targets) > 1:
                    record["chembl_target"] = targets

        return {"_id": doc_id, "chembl": list(by_uniprot.values())}