diff --git a/bigbio/biodatasets/thomas2011/thomas2011.py b/bigbio/biodatasets/thomas2011/thomas2011.py
index 427c4be2..e5c5734d 100644
--- a/bigbio/biodatasets/thomas2011/thomas2011.py
+++ b/bigbio/biodatasets/thomas2011/thomas2011.py
@@ -43,15 +43,16 @@
 from pathlib import Path
 from shutil import rmtree
 from typing import Dict, List, Tuple
-
+import xml.etree.ElementTree as ET
 import datasets
 import pandas as pd
+import requests
 
 from bigbio.utils import schemas
 from bigbio.utils.configs import BigBioConfig
 from bigbio.utils.constants import Lang, Tasks
 from bigbio.utils.license import CustomLicense
-
+import time
 
 _LANGUAGES = [Lang.EN]
 _PUBMED = True
@@ -98,8 +99,7 @@
 # this is a backup url in case the official one will stop working
 # _URLS = ["http://github.com/rockt/SETH/zipball/master/"]
 _URLS = {
-    "source": "https://www.scai.fraunhofer.de/content/dam/scai/de/downloads/bioinformatik/normalization-variation-corpus.gz",
-    "bigbio_kb": "https://www.scai.fraunhofer.de/content/dam/scai/de/downloads/bioinformatik/normalization-variation-corpus.gz",
+    _DATASETNAME: "https://www.scai.fraunhofer.de/content/dam/scai/de/downloads/bioinformatik/normalization-variation-corpus.gz",
 }
 
 _SUPPORTED_TASKS = [
@@ -117,35 +117,16 @@ class Thomas2011Dataset(datasets.GeneratorBasedBuilder):
     SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
     BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION)
 
-    # You will be able to load the "source" or "bigbio" configurations with
-    #    ds_source = datasets.load_dataset('my_dataset', name='source')
-    #    ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio')
-
-    # For local datasets you can make use of the `data_dir` and `data_files` kwargs
-    # https://huggingface.co/docs/datasets/add_dataset.html#downloading-data-files-and-organizing-splits
-    #    ds_source = datasets.load_dataset('my_dataset', name='source', data_dir="/path/to/data/files")
-    #    ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio', data_dir="/path/to/data/files")
-
-    # TODO: For each dataset, implement Config for Source and BigBio;
-    #  If dataset contains more than one subset (see examples/bioasq.py) implement for EACH of them.
-    #  Each of them should contain:
-    #   - name: should be unique for each dataset config eg. bioasq10b_(source|bigbio)_[bigbio_schema_name]
-    #   - version: option = (SOURCE_VERSION|BIGBIO_VERSION)
-    #   - description: one line description for the dataset
-    #   - schema: options = (source|bigbio_[bigbio_schema_name])
-    #   - subset_id: subset id is the canonical name for the dataset (eg. bioasq10b)
-    #    where [bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment)
-
     BUILDER_CONFIGS = [
         BigBioConfig(
-            name="thomas2011_source",
+            name=f"{_DATASETNAME}_source",
             version=SOURCE_VERSION,
             description="Thomas et al 2011 source schema",
             schema="source",
             subset_id="thomas2011",
         ),
         BigBioConfig(
-            name="thomas2011_bigbio_kb",
+            name=f"{_DATASETNAME}_bigbio_kb",
            version=BIGBIO_VERSION,
             description="Thomas et al 2011 BigBio schema",
             schema="bigbio_kb",
@@ -153,16 +134,10 @@ class Thomas2011Dataset(datasets.GeneratorBasedBuilder):
         ),
     ]
 
-    DEFAULT_CONFIG_NAME = "thomas2011_source"
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
 
     def _info(self) -> datasets.DatasetInfo:
-        # Create the source schema; this schema will keep all keys/information/labels as close to the original dataset as possible.
-        # Much of this design is copied from biodatasets/verspoor_2013/verspoor_2013.py
-
-        # You can arbitrarily nest lists and dictionaries.
-        # For iterables, use lists over tuples or `datasets.Sequence`
-
         if self.config.schema == "source":
             features = datasets.Features(
                 {
@@ -188,54 +163,53 @@ def _info(self) -> datasets.DatasetInfo:
 
     def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
         """Returns SplitGenerators."""
-        # Download gets entire git repo containing unused data from other datasets
-        # repo_dir = Path(dl_manager.download_and_extract(_URLS[0]))
-        # data_dir = repo_dir / "data"
-        # data_dir.mkdir(exist_ok=True)
-
-        # Find the relevant files from Verspor2013 and move them to a new directory
-        # thomas2011_files = repo_dir.glob("*/*/*thomas2011/**/*")
-        # for file in thomas2011_files:
-        #     if file.is_file() and "README" not in str(file):
-        #         file.rename(data_dir / file.name)
-
-        # Delete all unused files and directories from the original download
-        # for x in repo_dir.glob("[!data]*"):
-        #     if x.is_file():
-        #         x.unlink()
-        #     elif x.is_dir():
-        #         rmtree(x)
-
-        data_dir = dl_manager.download_and_extract(_URLS[self.config.schema])
+        data_dir = dl_manager.download_and_extract(_URLS[_DATASETNAME])
 
         return [
             datasets.SplitGenerator(
-                name=datasets.Split.TEST,
+                name=datasets.Split.TRAIN,
                 # Whatever you put in gen_kwargs will be passed to _generate_examples
                 gen_kwargs={
                     "filepath": os.path.join(data_dir, "annotations.txt"),
-                    "split": "test",
                 },
             )
         ]
 
-    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
-
-    # TODO: change the args of this function to match the keys in `gen_kwargs`. You may add any necessary kwargs.
-    def _generate_examples(self, filepath: str, split: str) -> Tuple[int, Dict]:
+    def get_clean_pubmed_abstract(self, id):
+        url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
+        params = {
+            "db": "pubmed",
+            "id": id,
+            "retmode": "xml",
+            "rettype": "medline",
+        }
+        res = requests.get(url, params=params)
+        tree = ET.XML(res.text)
+        article = tree.find("PubmedArticle").find("MedlineCitation").find("Article")
+        article_title = article.find("ArticleTitle").text
+        abstract_parts = [f"{article_title}"]
+        article_abstract = article.find("Abstract").findall("AbstractText")
+        for abstract_part in article_abstract:
+            label = abstract_part.attrib.get("Label", "")
+            if label:
+                abstract_parts.append(f"{label}: {abstract_part.text}")
+            else:
+                abstract_parts.append(abstract_part.text)
+        return article_title, " ".join(abstract_parts)
+
+    def _generate_examples(self, filepath: str) -> Tuple[int, Dict]:
         """Yields examples as (key, example) tuples."""
-        if split == "test":
-            data_ann = []
-            with open(filepath, encoding="utf-8") as ann_tsv_file:
-                csv_reader_code = csv.reader(
-                    ann_tsv_file,
-                    quotechar="'",
-                    delimiter="\t",
-                    quoting=csv.QUOTE_ALL,
-                    skipinitialspace=True,
-                )
-                for id_, row in enumerate(csv_reader_code):
-                    data_ann.append(row)
+        data_ann = []
+        with open(filepath, encoding="utf-8") as ann_tsv_file:
+            csv_reader_code = csv.reader(
+                ann_tsv_file,
+                quotechar="'",
+                delimiter="\t",
+                quoting=csv.QUOTE_ALL,
+                skipinitialspace=True,
+            )
+            for id_, row in enumerate(csv_reader_code):
+                data_ann.append(row)
 
         if self.config.schema == "source":
             for id_, row in enumerate(data_ann):
@@ -259,20 +233,39 @@ def _generate_examples(self, filepath: str, split: str) -> Tuple[int, Dict]:
             ]
             df = pd.DataFrame(data_ann, columns=cols)
             uid = 0
+            curr_count = 0
             for id_ in df.doc_id.unique():
+                curr_count += 1
+                if curr_count == 3:
+                    # The PubMed API limits 3 requests per second without an API key
+                    time.sleep(0.5)
+                    curr_count = 0
                 elist = []
+                article_title, abstract_text = self.get_clean_pubmed_abstract(id_)
+                uid += 1
+                passage = {
+                    "id": uid,
+                    "type": "",
+                    "text": [abstract_text],
+                    "offsets": [[0, len(abstract_text)]],
+                }
+
                 for row in df.loc[df.doc_id == id_].itertuples():
                     uid += 1
                     if row.protein_or_nucleotide_sequence_mutation == "PSM":
                         ent_type = "Protein Sequence Mutation"
                     else:
                         ent_type = "Nucleotide Sequence Mutation"
+                    tag_start, tag_end = int(row.off1), int(row.off2)
+                    if tag_start > len(article_title):
+                        tag_start -= 1
+                        tag_end -= 1
                     elist.append(
                         {
                             "id": str(uid),
                             "type": ent_type,
                             "text": [row.covered_text],
-                            "offsets": [[int(row.off1), int(row.off2)]],
+                            "offsets": [[tag_start, tag_end]],
                             "normalized": [{"db_name": "dbSNP", "db_id": row.dbSNP_id}],
                         }
                     )
@@ -280,12 +273,8 @@ def _generate_examples(self, filepath: str, split: str) -> Tuple[int, Dict]:
                     "id": id_,  # uid is an unique identifier for every record that starts from 1
                     "document_id": str(row[0]),
                     "entities": elist,
-                    "passages": [],
+                    "passages": [passage],
                     "events": [],
                     "coreferences": [],
                     "relations": [],
                 }
-
-
-# This template is based on the following template from the datasets package:
-# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py
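
For local testing of the abstract fetching introduced in get_clean_pubmed_abstract, the following is a minimal standalone sketch (not part of the patch). It mirrors the same efetch call but adds an optional NCBI API key (E-utilities allow roughly 3 requests/second without a key and about 10 with one), a timeout, and a guard for articles without an Abstract element. The function name, the NCBI_API_KEY environment variable, and the placeholder PMIDs are illustrative assumptions, not code from this change.

# Illustrative standalone sketch only; names and placeholder PMIDs are hypothetical.
import os
import time
import xml.etree.ElementTree as ET
from typing import Optional, Tuple

import requests

EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"


def fetch_title_and_abstract(pmid: str, api_key: Optional[str] = None) -> Tuple[str, str]:
    """Fetch a PubMed title plus flattened abstract text, as the loader does."""
    params = {"db": "pubmed", "id": pmid, "retmode": "xml", "rettype": "medline"}
    if api_key:
        params["api_key"] = api_key  # raises the E-utilities per-second limit
    res = requests.get(EFETCH_URL, params=params, timeout=30)
    res.raise_for_status()
    article = ET.fromstring(res.content).find("PubmedArticle/MedlineCitation/Article")
    title = article.findtext("ArticleTitle")
    parts = [title]
    abstract = article.find("Abstract")
    if abstract is not None:
        for part in abstract.findall("AbstractText"):
            label = part.attrib.get("Label", "")
            parts.append(f"{label}: {part.text}" if label else part.text)
    return title, " ".join(parts)


if __name__ == "__main__":
    api_key = os.environ.get("NCBI_API_KEY")  # optional; the loader itself does not use one
    for pmid in ["12345678", "23456789"]:  # placeholder PMIDs
        title, abstract = fetch_title_and_abstract(pmid, api_key)
        print(title)
        # Throttle between calls: ~3 req/s without a key, ~10 req/s with one.
        time.sleep(0.34 if api_key is None else 0.1)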
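
Once the loader runs, the regenerated bigbio_kb examples can be spot-checked with a sketch like the one below, which counts entities whose adjusted offsets do not line up with the fetched title-plus-abstract passage. It assumes a datasets version that still resolves local loading scripts; the script path is an assumption about the local checkout, while the config name, split, and field names come from the diff above.

# Illustrative consumer-side check; the local script path is an assumption.
import datasets

ds = datasets.load_dataset(
    "bigbio/biodatasets/thomas2011/thomas2011.py",
    name="thomas2011_bigbio_kb",
)

mismatches = 0
for doc in ds["train"]:
    passage_text = doc["passages"][0]["text"][0]  # title + abstract built by the loader
    for entity in doc["entities"]:
        start, end = entity["offsets"][0]
        if passage_text[start:end] != entity["text"][0]:
            mismatches += 1
print(f"entities whose adjusted offsets do not match the passage text: {mismatches}")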