diff --git a/biodatasets/s800/s800.py b/biodatasets/s800/s800.py
new file mode 100644
index 00000000..73f240ed
--- /dev/null
+++ b/biodatasets/s800/s800.py
@@ -0,0 +1,262 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+S800 Corpus: an abstract-based, manually annotated corpus for named entity recognition.
+S800 comprises 800 PubMed abstracts in which organism mentions were identified and mapped to the corresponding NCBI Taxonomy identifiers.
+
+To increase the taxonomic diversity of the mentions, the S800 abstracts were collected by selecting 100 abstracts from each of the following 8 categories: bacteriology, botany, entomology, medicine, mycology, protistology, virology and zoology.
+S800 was annotated with a focus on the species level; however, mentions of higher taxa (such as genera, families and orders) were also considered.
+"""
+
+from pathlib import Path
+from typing import Any, Dict, Iterator, List, Tuple
+
+import datasets
+import pandas as pd
+from bigbio.utils import schemas
+from bigbio.utils.configs import BigBioConfig
+from bigbio.utils.constants import Tasks
+
+_CITATION = """\
+@article{pafilis2013species,
+    title = {The SPECIES and ORGANISMS Resources for Fast and Accurate Identification of Taxonomic Names in Text},
+    author = {Pafilis, Evangelos AND Frankild, Sune P. AND Fanini, Lucia AND Faulwetter, Sarah AND Pavloudi, Christina AND Vasileiadou, Aikaterini AND Arvanitidis, Christos AND Jensen, Lars Juhl},
+    journal = {PLOS ONE},
+    publisher = {Public Library of Science},
+    year = {2013},
+    month = {06},
+    volume = {8},
+    pages = {1-6},
+    number = {6},
+    url = {https://doi.org/10.1371/journal.pone.0065390},
+    doi = {10.1371/journal.pone.0065390},
+    biburl = {https://journals.plos.org/plosone/article/citation/bibtex?id=10.1371/journal.pone.0065390},
+    bibsource = {https://journals.plos.org/plosone/article/citation?id=10.1371/journal.pone.0065390}
+}
+"""
+
+_DATASETNAME = "s800"
+
+_DESCRIPTION = """\
+S800 Corpus: an abstract-based, manually annotated corpus.
+S800 comprises 800 PubMed abstracts in which organism mentions were identified and mapped to the corresponding NCBI Taxonomy identifiers.
+
+To increase the taxonomic diversity of the mentions, the S800 abstracts were collected by selecting 100 abstracts from each of the following 8 categories: bacteriology, botany, entomology, medicine, mycology, protistology, virology and zoology.
+S800 was annotated with a focus on the species level; however, mentions of higher taxa (such as genera, families and orders) were also considered.
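+# Two configurations are exposed: `s800_source` mirrors the raw S800
+# distribution, while `s800_bigbio_kb` maps the same records onto the shared
+# BigBio knowledge-base schema (see _parse_example_to_kb_schema below).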
+""" + +_HOMEPAGE = "https://species.jensenlab.org/" + +_LICENSE = "Creative Commons License Attribution-ShareAlike 4.0 International" + +_URLS = { + _DATASETNAME: "https://species.jensenlab.org/files/S800-1.0.tar.gz", +} + +_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION] + +_SOURCE_VERSION = "1.0.0" + +_BIGBIO_VERSION = "1.0.0" + + +class S800Dataset(datasets.GeneratorBasedBuilder): + """S800 comprises 800 PubMed abstracts in which organism mentions were identified and mapped to the corresponding NCBI Taxonomy identifiers.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + BUILDER_CONFIGS = [ + BigBioConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + BigBioConfig( + name=f"{_DATASETNAME}_bigbio_kb", + version=BIGBIO_VERSION, + description=f"{_DATASETNAME} BigBio schema", + schema="bigbio_kb", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "doc_id": datasets.Value("string"), + "s800_doc_id": datasets.Value("string"), + "pmid": datasets.Value("string"), + "entities": { + "offsets": [datasets.Value("int64")], + "text": datasets.Value("string"), + "ncbi_txid": datasets.Value("string"), + }, + "category": datasets.Value("string"), + "category_id": datasets.Value("int64"), + "journal": datasets.Value("string"), + "text": datasets.Value("string"), + } + ) + + elif self.config.schema == "bigbio_kb": + features = schemas.kb_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ + "data_dir": Path(data_dir), + "split": "train", + }, + ), + ] + + def _generate_examples(self, data_dir: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + if self.config.schema == "source": + for key, example in self._read_example_from_file(data_dir): + yield key, example + + elif self.config.schema == "bigbio_kb": + for key, example in self._read_example_from_file_in_kb_schema(data_dir): + yield key, example + + def _read_example_from_file(self, data_dir: Path) -> Tuple[str, Dict]: + abstract_dir = data_dir / "abstracts" + df_s800 = pd.read_csv( + data_dir / "S800.tsv", + sep="\t", + header=None, + names=["nbci_taxonomy_id", "doc_id", "start", "end", "phrase"], + ).assign( + ncbi_txid=lambda dft: dft["nbci_taxonomy_id"].apply( + lambda x: f"NCBI:txid{x}" + ) + ) + + df_pubmed = pd.read_csv( + data_dir / "pubmedid.tsv", + sep="\t", + header=None, + names=["s800_doc_id", "pmid", "category", "category_id", "journal"], + ) + + df = ( + df_s800.groupby("doc_id") + .agg(list) + .reset_index() + .merge( + df_pubmed.assign( + doc_id=lambda dft: ( + dft["s800_doc_id"] + ":" + dft["pmid"] + ).str.replace("PMID:", "") + ), + on="doc_id", + how="left", + ) + ) + for _, row in df.iterrows(): + key = row.doc_id + entities = [ + dict(offsets=[s, e], text=p, ncbi_txid=ncbi_txid) + for s, e, p, ncbi_txid in 
+    def _read_example_from_file(self, data_dir: Path) -> Iterator[Tuple[str, Dict]]:
+        abstract_dir = data_dir / "abstracts"
+        df_s800 = pd.read_csv(
+            data_dir / "S800.tsv",
+            sep="\t",
+            header=None,
+            names=["ncbi_taxonomy_id", "doc_id", "start", "end", "phrase"],
+        ).assign(
+            ncbi_txid=lambda dft: dft["ncbi_taxonomy_id"].apply(
+                lambda x: f"NCBI:txid{x}"
+            )
+        )
+
+        df_pubmed = pd.read_csv(
+            data_dir / "pubmedid.tsv",
+            sep="\t",
+            header=None,
+            names=["s800_doc_id", "pmid", "category", "category_id", "journal"],
+        )
+
+        # Collect all mentions of a document into a single row, then attach
+        # the PubMed metadata for that document.
+        df = (
+            df_s800.groupby("doc_id")
+            .agg(list)
+            .reset_index()
+            .merge(
+                df_pubmed.assign(
+                    doc_id=lambda dft: (
+                        dft["s800_doc_id"] + ":" + dft["pmid"]
+                    ).str.replace("PMID:", "")
+                ),
+                on="doc_id",
+                how="left",
+            )
+        )
+        for _, row in df.iterrows():
+            key = row.doc_id
+            entities = [
+                dict(offsets=[s, e], text=p, ncbi_txid=ncbi_txid)
+                for s, e, p, ncbi_txid in zip(
+                    row.start, row.end, row.phrase, row.ncbi_txid
+                )
+            ]
+            doc_abstract_path = abstract_dir / f"{row.s800_doc_id}.txt"
+            with open(doc_abstract_path, encoding="utf-8") as fp:
+                text = fp.read()
+            example = {
+                "doc_id": key,
+                "s800_doc_id": row.s800_doc_id,
+                "pmid": row.pmid,
+                "entities": entities,
+                "category": row.category,
+                "category_id": row.category_id,
+                "journal": row.journal,
+                "text": text,
+            }
+            yield key, example
+
+    def _parse_example_to_kb_schema(self, example) -> Dict[str, Any]:
+        text = example["text"]
+        doc_id = example["doc_id"]
+        passages = [
+            {
+                "id": f"{doc_id}-P0",
+                "type": "abstract",
+                "text": [text],
+                "offsets": [[0, len(text)]],
+            }
+        ]
+        entities = []
+        for i, entity in enumerate(example["entities"]):
+            cs, ce = entity["offsets"]
+            ce = ce + 1  # S800 end offsets are inclusive; the KB schema expects exclusive ends
+            entities.append(
+                {
+                    "id": f"{doc_id}-E{i}",
+                    "text": [entity["text"]],
+                    "offsets": [[cs, ce]],
+                    "type": "species",
+                    "normalized": [
+                        {"db_id": entity["ncbi_txid"], "db_name": "NCBI Taxonomy"}
+                    ],
+                }
+            )
+        data = {
+            "id": doc_id,
+            "document_id": doc_id,
+            "passages": passages,
+            "entities": entities,
+            "relations": [],
+            "events": [],
+            "coreferences": [],
+        }
+        return data
+
+    def _read_example_from_file_in_kb_schema(self, data_dir: Path) -> Iterator[Tuple[str, Dict]]:
+        for key, example in self._read_example_from_file(data_dir):
+            example = self._parse_example_to_kb_schema(example)
+            yield key, example
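
For review, a minimal usage sketch. It assumes the script path `biodatasets/s800/s800.py` from this diff and a `datasets` version that still supports local loading scripts; the config names come from `BUILDER_CONFIGS` above, and the archive is fetched automatically from `_URLS`.

```python
from datasets import load_dataset

# Source schema: one record per abstract, with the raw S800 fields.
source = load_dataset("biodatasets/s800/s800.py", name="s800_source")

# BigBio KB schema: the shared passages/entities layout.
kb = load_dataset("biodatasets/s800/s800.py", name="s800_bigbio_kb")

doc = kb["train"][0]
print(doc["document_id"])
for entity in doc["entities"]:
    # offsets are [[start, end]] with exclusive ends after the +1 adjustment above
    print(entity["offsets"], entity["text"], entity["normalized"])
```

Both configurations expose a single `train` split, matching `_split_generators`.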