diff --git a/bigbio/hub/hub_repos/mantra_gsc/README.md b/bigbio/hub/hub_repos/mantra_gsc/README.md new file mode 100644 index 00000000..0119c8a9 --- /dev/null +++ b/bigbio/hub/hub_repos/mantra_gsc/README.md @@ -0,0 +1,71 @@ +--- +language: + - en + - fr + - de + - nl + - es +bigbio_language: + - English + - French + - German + - Dutch + - Spanish +license: gpl-3.0 +bigbio_license_shortname: GPL_3p0_ONLY +multilinguality: multilingual +pretty_name: MantraGSC +homepage: https://github.com/mi-erasmusmc/Mantra-Gold-Standard-Corpus +bigbio_pubmed: true +bigbio_public: true +bigbio_tasks: + - NAMED_ENTITY_RECOGNITION + - NAMED_ENTITY_DISAMBIGUATION +--- + + +# Dataset Card for Mantra GSC + +## Dataset Description + +- **Homepage:** https://github.com/mi-erasmusmc/Mantra-Gold-Standard-Corpus +- **Pubmed:** True +- **Public:** True +- **Tasks:** NER, NED + +We selected text units from different parallel corpora (Medline abstract titles, drug labels, biomedical patent claims) in English, French, German, Spanish, and Dutch. Three annotators per language independently annotated the biomedical concepts, based on a subset of the Unified Medical Language System and covering a wide range of semantic groups. + +## Citation Information + +``` +@article{10.1093/jamia/ocv037, + author = {Kors, Jan A and Clematide, Simon and Akhondi, + Saber A and van Mulligen, Erik M and Rebholz-Schuhmann, Dietrich}, + title = "{A multilingual gold-standard corpus for biomedical concept recognition: the Mantra GSC}", + journal = {Journal of the American Medical Informatics Association}, + volume = {22}, + number = {5}, + pages = {948-956}, + year = {2015}, + month = {05}, + abstract = "{Objective To create a multilingual gold-standard corpus for biomedical concept recognition.Materials + and methods We selected text units from different parallel corpora (Medline abstract titles, drug labels, + biomedical patent claims) in English, French, German, Spanish, and Dutch. 
Three annotators per language + independently annotated the biomedical concepts, based on a subset of the Unified Medical Language System and + covering a wide range of semantic groups. To reduce the annotation workload, automatically generated + preannotations were provided. Individual annotations were automatically harmonized and then adjudicated, and + cross-language consistency checks were carried out to arrive at the final annotations.Results The number of final + annotations was 5530. Inter-annotator agreement scores indicate good agreement (median F-score 0.79), and are + similar to those between individual annotators and the gold standard. The automatically generated harmonized + annotation set for each language performed equally well as the best annotator for that language.Discussion The use + of automatic preannotations, harmonized annotations, and parallel corpora helped to keep the manual annotation + efforts manageable. The inter-annotator agreement scores provide a reference standard for gauging the performance + of automatic annotation techniques.Conclusion To our knowledge, this is the first gold-standard corpus for + biomedical concept recognition in languages other than English. 
Other distinguishing features are the wide variety + of semantic groups that are being covered, and the diversity of text genres that were annotated.}", + issn = {1067-5027}, + doi = {10.1093/jamia/ocv037}, + url = {https://doi.org/10.1093/jamia/ocv037}, + eprint = {https://academic.oup.com/jamia/article-pdf/22/5/948/34146393/ocv037.pdf}, +} +``` diff --git a/bigbio/hub/hub_repos/mantra_gsc/bigbiohub.py b/bigbio/hub/hub_repos/mantra_gsc/bigbiohub.py new file mode 100644 index 00000000..f4da7bb7 --- /dev/null +++ b/bigbio/hub/hub_repos/mantra_gsc/bigbiohub.py @@ -0,0 +1,590 @@ +from collections import defaultdict +from dataclasses import dataclass +from enum import Enum +import logging +from pathlib import Path +from types import SimpleNamespace +from typing import TYPE_CHECKING, Dict, Iterable, List, Tuple + +import datasets + +if TYPE_CHECKING: + import bioc + +logger = logging.getLogger(__name__) + + +BigBioValues = SimpleNamespace(NULL="") + + +@dataclass +class BigBioConfig(datasets.BuilderConfig): + """BuilderConfig for BigBio.""" + + name: str = None + version: datasets.Version = None + description: str = None + schema: str = None + subset_id: str = None + + +class Tasks(Enum): + NAMED_ENTITY_RECOGNITION = "NER" + NAMED_ENTITY_DISAMBIGUATION = "NED" + EVENT_EXTRACTION = "EE" + RELATION_EXTRACTION = "RE" + COREFERENCE_RESOLUTION = "COREF" + QUESTION_ANSWERING = "QA" + TEXTUAL_ENTAILMENT = "TE" + SEMANTIC_SIMILARITY = "STS" + TEXT_PAIRS_CLASSIFICATION = "TXT2CLASS" + PARAPHRASING = "PARA" + TRANSLATION = "TRANSL" + SUMMARIZATION = "SUM" + TEXT_CLASSIFICATION = "TXTCLASS" + + +entailment_features = datasets.Features( + { + "id": datasets.Value("string"), + "premise": datasets.Value("string"), + "hypothesis": datasets.Value("string"), + "label": datasets.Value("string"), + } +) + +pairs_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text_1": datasets.Value("string"), + "text_2": 
datasets.Value("string"), + "label": datasets.Value("string"), + } +) + +qa_features = datasets.Features( + { + "id": datasets.Value("string"), + "question_id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "question": datasets.Value("string"), + "type": datasets.Value("string"), + "choices": [datasets.Value("string")], + "context": datasets.Value("string"), + "answer": datasets.Sequence(datasets.Value("string")), + } +) + +text_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text": datasets.Value("string"), + "labels": [datasets.Value("string")], + } +) + +text2text_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text_1": datasets.Value("string"), + "text_2": datasets.Value("string"), + "text_1_name": datasets.Value("string"), + "text_2_name": datasets.Value("string"), + } +) + +kb_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "passages": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + } + ], + "entities": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + "normalized": [ + { + "db_name": datasets.Value("string"), + "db_id": datasets.Value("string"), + } + ], + } + ], + "events": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + # refers to the text_bound_annotation of the trigger + "trigger": { + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + }, + "arguments": [ + { + "role": datasets.Value("string"), + "ref_id": datasets.Value("string"), + } + ], + } + ], + 
def _split_discontiguous_text(text: str, offsets) -> List[str]:
    """Split an annotation's surface text into one chunk per span.

    The text of a discontiguous annotation is stored as its fragments joined
    by spaces. This is a heuristic: each fragment is assumed to be exactly as
    long as its span, and the separating spaces are skipped.

    Args:
        text: Surface text of the whole annotation.
        offsets: Sequence of ``(start, end)`` pairs, one per span.

    Returns:
        ``[text]`` when there is at most one span, otherwise one substring of
        ``text`` per span, in span order.
    """
    if len(offsets) <= 1:
        return [text]

    chunks = []
    cursor = 0
    for start, end in offsets:
        chunk_len = end - start
        chunks.append(text[cursor : cursor + chunk_len])
        cursor += chunk_len
        # Skip the space(s) that join two fragments in the annotation text.
        while cursor < len(text) and text[cursor] == " ":
            cursor += 1
    return chunks


def get_texts_and_offsets_from_bioc_ann(ann: "bioc.BioCAnnotation") -> Tuple:
    """Extract span offsets and per-span texts from a BioC annotation.

    Args:
        ann: Object exposing ``locations`` (items with ``offset`` and
            ``length``) and ``text``.

    Returns:
        A ``(offsets, texts)`` tuple -- note the order: offsets first.
        ``offsets`` is a list of ``(start, end)`` tuples and ``texts`` holds
        one string per span (a single-element list for contiguous
        annotations).
    """
    offsets = [(loc.offset, loc.offset + loc.length) for loc in ann.locations]
    texts = _split_discontiguous_text(ann.text, offsets)
    return offsets, texts


def remove_prefix(a: str, prefix: str) -> str:
    """Return ``a`` without ``prefix`` if it starts with it, else ``a`` unchanged."""
    if a.startswith(prefix):
        a = a[len(prefix) :]
    return a


def parse_brat_file(
    txt_file: Path,
    annotation_file_suffixes: List[str] = None,
    parse_notes: bool = False,
) -> Dict:
    """Parse a brat document and its standoff annotation files into a dict.

    ``txt_file`` is the path to the brat ``.txt`` file, e.g. ``data/1234.txt``.
    Annotations are read from the sibling files obtained by swapping the
    suffix (``data/1234.ann``, or ``data/1234.a1`` and ``data/1234.a2``);
    suffixes whose file does not exist are skipped.

    Both the text and the annotation files are decoded as UTF-8 so that
    character offsets do not depend on the platform's default encoding.

    Args:
        txt_file: Path to the ``.txt`` file.
        annotation_file_suffixes: Suffixes of the annotation files to read.
            Defaults to ``[".a1", ".a2", ".ann"]`` when ``None``.
        parse_notes: When true, ``#`` lines are parsed into ``"notes"``;
            otherwise they are ignored and the key is absent.

    Returns:
        A dict with the keys:

        * ``document_id`` -- file name without suffix.
        * ``text`` -- full document text.
        * ``text_bound_annotations`` (``T`` lines) -- dicts with ``id``,
          ``type``, ``offsets`` (list of ``[start, end]``) and ``text``
          (one string per span).
        * ``events`` (``E`` lines) -- dicts with ``id``, ``type``, ``trigger``
          (id of a text-bound annotation) and ``arguments`` (list of
          ``{"role", "ref_id"}``).
        * ``relations`` (``R`` lines) -- dicts with ``id``, ``type``, ``head``
          and ``tail`` (each ``{"role", "ref_id"}``).
        * ``equivalences`` (``*`` lines) -- dicts with ``id`` and ``ref_ids``.
        * ``attributes`` (``A``/``M`` lines) -- dicts with ``id``, ``type``,
          ``ref_id`` and ``value`` (``""`` when absent).
        * ``normalizations`` (``N`` lines) -- dicts with ``id``, ``type``,
          ``ref_id``, ``resource_name``, ``cuid`` and ``text``.
        * ``notes`` (``#`` lines, only with ``parse_notes``) -- dicts with
          ``id``, ``type``, ``ref_id`` and ``text``.

    Raises:
        AssertionError: If ``annotation_file_suffixes`` is an empty list.
    """
    example = {}
    example["document_id"] = txt_file.with_suffix("").name
    with txt_file.open(encoding="utf-8") as f:
        example["text"] = f.read()

    # If no specific suffixes of the to-be-read annotation files are given,
    # take the standard suffixes for event extraction.
    if annotation_file_suffixes is None:
        annotation_file_suffixes = [".a1", ".a2", ".ann"]

    if len(annotation_file_suffixes) == 0:
        raise AssertionError(
            "At least one suffix for the to-be-read annotation files should be given!"
        )

    ann_lines = []
    for suffix in annotation_file_suffixes:
        annotation_file = txt_file.with_suffix(suffix)
        if annotation_file.exists():
            with annotation_file.open(encoding="utf-8") as f:
                ann_lines.extend(f.readlines())

    example["text_bound_annotations"] = []
    example["events"] = []
    example["relations"] = []
    example["equivalences"] = []
    example["attributes"] = []
    example["normalizations"] = []

    if parse_notes:
        example["notes"] = []

    for line in ann_lines:
        line = line.strip()
        if not line:
            continue

        if line.startswith("T"):  # Text bound
            ann = {}
            fields = line.split("\t")

            ann["id"] = fields[0]
            ann["type"] = fields[1].split()[0]
            span_str = remove_prefix(fields[1], (ann["type"] + " "))
            text = fields[2]
            # Discontiguous annotations list their spans separated by ';'.
            ann["offsets"] = []
            for span in span_str.split(";"):
                start, end = span.split()
                ann["offsets"].append([int(start), int(end)])

            # Heuristically split text of discontiguous entities into chunks
            ann["text"] = _split_discontiguous_text(text, ann["offsets"])

            example["text_bound_annotations"].append(ann)

        elif line.startswith("E"):
            ann = {}
            fields = line.split("\t")
            tokens = fields[1].split()

            ann["id"] = fields[0]
            # First token is "<type>:<trigger id>", the rest "<role>:<ref id>".
            ann["type"], ann["trigger"] = tokens[0].split(":")

            ann["arguments"] = []
            for role_ref_id in tokens[1:]:
                parts = role_ref_id.split(":")
                ann["arguments"].append({"role": parts[0], "ref_id": parts[1]})

            example["events"].append(ann)

        elif line.startswith("R"):
            ann = {}
            fields = line.split("\t")
            tokens = fields[1].split()

            ann["id"] = fields[0]
            ann["type"] = tokens[0]

            head = tokens[1].split(":")
            tail = tokens[2].split(":")
            ann["head"] = {"role": head[0], "ref_id": head[1]}
            ann["tail"] = {"role": tail[0], "ref_id": tail[1]}

            example["relations"].append(ann)

        # '*' seems to be the legacy way to mark equivalences,
        # but I couldn't find any info on the current way
        # this might have to be adapted dependent on the brat version
        # of the annotation
        elif line.startswith("*"):
            ann = {}
            fields = line.split("\t")

            ann["id"] = fields[0]
            ann["ref_ids"] = fields[1].split()[1:]

            example["equivalences"].append(ann)

        elif line.startswith(("A", "M")):
            ann = {}
            fields = line.split("\t")

            ann["id"] = fields[0]

            info = fields[1].split()
            ann["type"] = info[0]
            ann["ref_id"] = info[1]

            if len(info) > 2:
                ann["value"] = info[2]
            else:
                ann["value"] = ""

            example["attributes"].append(ann)

        elif line.startswith("N"):
            ann = {}
            fields = line.split("\t")

            ann["id"] = fields[0]
            ann["text"] = fields[2]

            info = fields[1].split()

            ann["type"] = info[0]
            ann["ref_id"] = info[1]
            # "<resource>:<id>" -- split only once so that identifiers which
            # themselves contain ':' are kept whole.
            resource_and_id = info[2].split(":", 1)
            ann["resource_name"] = resource_and_id[0]
            ann["cuid"] = resource_and_id[1]
            example["normalizations"].append(ann)

        elif parse_notes and line.startswith("#"):
            ann = {}
            fields = line.split("\t")

            ann["id"] = fields[0]
            ann["text"] = fields[2] if len(fields) == 3 else BigBioValues.NULL

            info = fields[1].split()

            ann["type"] = info[0]
            ann["ref_id"] = info[1]
            example["notes"].append(ann)

    return example


def brat_parse_to_bigbio_kb(brat_parse: Dict) -> Dict:
    """Convert a brat parse into a dict following the ``bigbio-kb`` schema.

    ``brat_parse`` is expected to have the shape produced by
    ``parse_brat_file``. The input is not modified.

    All ids are prefixed with ``"<document_id>_"`` because brat ids are only
    unique within their document. Text-bound annotations that act as event
    triggers become part of the event; all others become entities. Relations
    and equivalence clusters that reference anything other than an entity
    are dropped (skipped relations are logged).

    Args:
        brat_parse: Dict with ``document_id``, ``text``,
            ``text_bound_annotations``, ``events``, ``relations``,
            ``equivalences`` and ``normalizations``.

    Returns:
        Dict with ``document_id``, ``passages``, ``events``, ``entities``,
        ``relations`` and ``coreferences``. The top-level ``id`` is left for
        the caller to set.

    Raises:
        ValueError: If an event refers to a trigger id that is not among the
            text-bound annotations.
    """
    unified_example = {}

    # Prefix all ids with document id to ensure global uniqueness,
    # because brat ids are only unique within their document
    id_prefix = brat_parse["document_id"] + "_"

    # identical
    unified_example["document_id"] = brat_parse["document_id"]
    unified_example["passages"] = [
        {
            # NOTE: yields "<doc>__text" (double underscore); kept as-is so
            # that already published passage ids stay stable.
            "id": id_prefix + "_text",
            "type": "abstract",
            "text": [brat_parse["text"]],
            "offsets": [[0, len(brat_parse["text"])]],
        }
    ]

    # get normalizations
    ref_id_to_normalizations = defaultdict(list)
    for normalization in brat_parse["normalizations"]:
        ref_id_to_normalizations[normalization["ref_id"]].append(
            {
                "db_name": normalization["resource_name"],
                "db_id": normalization["cuid"],
            }
        )

    # separate entities and event triggers
    unified_example["events"] = []
    non_event_ann = brat_parse["text_bound_annotations"].copy()
    for event in brat_parse["events"]:
        event = event.copy()
        event["id"] = id_prefix + event["id"]
        trigger = next(
            (
                tr
                for tr in brat_parse["text_bound_annotations"]
                if tr["id"] == event["trigger"]
            ),
            None,
        )
        if trigger is None:
            # A bare StopIteration here would surface as an opaque
            # RuntimeError when this is called from inside a generator.
            raise ValueError(
                f"Event {event['id']} refers to unknown trigger {event['trigger']}"
            )
        if trigger in non_event_ann:
            non_event_ann.remove(trigger)
        event["trigger"] = {
            "text": trigger["text"].copy(),
            "offsets": trigger["offsets"].copy(),
        }
        # Build new argument dicts instead of editing the caller's in place;
        # otherwise a second conversion would prefix the ids twice.
        event["arguments"] = [
            {**argument, "ref_id": id_prefix + argument["ref_id"]}
            for argument in event["arguments"]
        ]

        unified_example["events"].append(event)

    unified_example["entities"] = []
    anno_ids = {ann["id"] for ann in non_event_ann}
    for ann in non_event_ann:
        entity_ann = ann.copy()
        entity_ann["id"] = id_prefix + entity_ann["id"]
        entity_ann["normalized"] = ref_id_to_normalizations[ann["id"]]
        unified_example["entities"].append(entity_ann)

    # massage relations
    unified_example["relations"] = []
    skipped_relations = set()
    for ann in brat_parse["relations"]:
        if (
            ann["head"]["ref_id"] not in anno_ids
            or ann["tail"]["ref_id"] not in anno_ids
        ):
            skipped_relations.add(ann["id"])
            continue
        unified_example["relations"].append(
            {
                "arg1_id": id_prefix + ann["head"]["ref_id"],
                "arg2_id": id_prefix + ann["tail"]["ref_id"],
                "id": id_prefix + ann["id"],
                "type": ann["type"],
                "normalized": [],
            }
        )
    if len(skipped_relations) > 0:
        example_id = brat_parse["document_id"]
        logger.info(
            f"Example:{example_id}: The `bigbio_kb` schema allows `relations` only between entities."
            f" Skip (for now): "
            f"{list(skipped_relations)}"
        )

    # get coreferences
    unified_example["coreferences"] = []
    for i, ann in enumerate(brat_parse["equivalences"], start=1):
        # Only clusters made up exclusively of entities are kept: every member
        # must be text-bound ("T...") and must not be an event trigger.
        is_entity_cluster = all(
            ref_id.startswith("T") and ref_id in anno_ids
            for ref_id in ann["ref_ids"]
        )
        if is_entity_cluster:
            entity_ids = [id_prefix + ref_id for ref_id in ann["ref_ids"]]
            unified_example["coreferences"].append(
                {"id": id_prefix + str(i), "entity_ids": entity_ids}
            )
    return unified_example
+ +import ast +from itertools import product +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from .bigbiohub import (BigBioConfig, Tasks, brat_parse_to_bigbio_kb, + kb_features, parse_brat_file) + +_LANGUAGES = ["English", "French", "German", "Dutch", "Spanish"] + +_LOCAL = False +_PUBMED = True + +_CITATION = """\ +@article{10.1093/jamia/ocv037, + author = {Kors, Jan A and Clematide, Simon and Akhondi, + Saber A and van Mulligen, Erik M and Rebholz-Schuhmann, Dietrich}, + title = "{A multilingual gold-standard corpus for biomedical concept recognition: the Mantra GSC}", + journal = {Journal of the American Medical Informatics Association}, + volume = {22}, + number = {5}, + pages = {948-956}, + year = {2015}, + month = {05}, + abstract = "{Objective To create a multilingual gold-standard corpus for biomedical concept recognition.Materials + and methods We selected text units from different parallel corpora (Medline abstract titles, drug labels, + biomedical patent claims) in English, French, German, Spanish, and Dutch. Three annotators per language + independently annotated the biomedical concepts, based on a subset of the Unified Medical Language System and + covering a wide range of semantic groups. To reduce the annotation workload, automatically generated + preannotations were provided. Individual annotations were automatically harmonized and then adjudicated, and + cross-language consistency checks were carried out to arrive at the final annotations.Results The number of final + annotations was 5530. Inter-annotator agreement scores indicate good agreement (median F-score 0.79), and are + similar to those between individual annotators and the gold standard. 
The automatically generated harmonized + annotation set for each language performed equally well as the best annotator for that language.Discussion The use + of automatic preannotations, harmonized annotations, and parallel corpora helped to keep the manual annotation + efforts manageable. The inter-annotator agreement scores provide a reference standard for gauging the performance + of automatic annotation techniques.Conclusion To our knowledge, this is the first gold-standard corpus for + biomedical concept recognition in languages other than English. Other distinguishing features are the wide variety + of semantic groups that are being covered, and the diversity of text genres that were annotated.}", + issn = {1067-5027}, + doi = {10.1093/jamia/ocv037}, + url = {https://doi.org/10.1093/jamia/ocv037}, + eprint = {https://academic.oup.com/jamia/article-pdf/22/5/948/34146393/ocv037.pdf}, +} +""" + +_DATASETNAME = "mantra_gsc" +_DISPLAYNAME = "Mantra GSC" + +_DESCRIPTION = """\ +We selected text units from different parallel corpora (Medline abstract titles, drug labels, biomedical patent claims) +in English, French, German, Spanish, and Dutch. Three annotators per language independently annotated the biomedical +concepts, based on a subset of the Unified Medical Language System and covering a wide range of semantic groups. 
class MantraGSCDataset(datasets.GeneratorBasedBuilder):
    """Mantra Gold Standard Corpus (GSC) dataset.

    One ``source`` and one ``bigbio_kb`` config is registered per
    (language, corpus) pair; the patents corpus has no Dutch or Spanish part.
    Config names have the form ``mantra_gsc_<lang>_<corpus>_<schema>``.
    """

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION)

    BUILDER_CONFIGS = []

    for language, dataset_type in product(_LANGUAGES_2, _DATASET_TYPES):
        if dataset_type == "patents" and language in ["nl", "es"]:
            continue

        BUILDER_CONFIGS.append(
            BigBioConfig(
                name=f"mantra_gsc_{language}_{dataset_type}_source",
                version=SOURCE_VERSION,
                description=f"Mantra GSC {_LANGUAGES_2[language]} {_DATASET_TYPES[dataset_type]} source schema",
                schema="source",
                subset_id=f"mantra_gsc_{language}_{_DATASET_TYPES[dataset_type]}",
            )
        )
        BUILDER_CONFIGS.append(
            BigBioConfig(
                name=f"mantra_gsc_{language}_{dataset_type}_bigbio_kb",
                # The harmonised view is versioned independently of the source data.
                version=BIGBIO_VERSION,
                description=f"Mantra GSC {_LANGUAGES_2[language]} {_DATASET_TYPES[dataset_type]} BigBio schema",
                schema="bigbio_kb",
                subset_id=f"mantra_gsc_{language}_{_DATASET_TYPES[dataset_type]}",
            )
        )

    DEFAULT_CONFIG_NAME = "mantra_gsc_en_medline_source"

    def _info(self) -> datasets.DatasetInfo:
        """Return the dataset metadata and the features of the selected schema.

        Raises:
            ValueError: If the config's schema is neither ``"source"`` nor
                ``"bigbio_kb"``.
        """
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "document_id": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "entities": [
                        {
                            "entity_id": datasets.Value("string"),
                            "type": datasets.Value("string"),
                            "offsets": datasets.Sequence([datasets.Value("int32")]),
                            "text": datasets.Sequence(datasets.Value("string")),
                            "cui": datasets.Value("string"),
                            "preferred_term": datasets.Value("string"),
                            "semantic_type": datasets.Value("string"),
                            "normalized": [
                                {
                                    "db_name": datasets.Value("string"),
                                    "db_id": datasets.Value("string"),
                                }
                            ],
                        }
                    ],
                }
            )

        elif self.config.schema == "bigbio_kb":
            features = kb_features

        else:
            raise ValueError(f"Unsupported schema: {self.config.schema!r}")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=str(_LICENSE),
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
        """Download the corpus and return its single (train) split generator."""

        urls = _URLS[_DATASETNAME]
        data_dir = dl_manager.download_and_extract(urls)
        data_dir = Path(data_dir) / "Mantra-GSC"

        # Config names look like "mantra_gsc_<lang>_<corpus>_<schema>".
        language, dataset_type = self.config.name.split("_")[2:4]

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "data_dir": data_dir,
                    "language": language,
                    "dataset_type": dataset_type,
                },
            ),
        ]

    def _generate_examples(self, data_dir: Path, language: str, dataset_type: str) -> Tuple[int, Dict]:
        """Yield examples as ``(key, example)`` tuples.

        Args:
            data_dir: Root of the extracted corpus.
            language: Key of ``_LANGUAGES_2``, e.g. ``"en"``.
            dataset_type: Key of ``_DATASET_TYPES``, e.g. ``"medline"``.
        """
        language_dir = data_dir / f"{_LANGUAGES_2[language]}"
        corpus = _DATASET_TYPES[dataset_type]

        if dataset_type in ["patents", "emea"]:
            data_dirs = [language_dir / f"{corpus}_ec22-cui-best_man"]
        elif language != "en":
            # It is Medline now
            data_dirs = [language_dir / f"{corpus}_EN_{language.upper()}_ec22-cui-best_man"]
        else:
            # English Medline is the union of the English side of every
            # language pair.
            data_dirs = [
                language_dir / f"{corpus}_EN_{_lang.upper()}_ec22-cui-best_man"
                for _lang in _LANGUAGES_2
                if _lang != "en"
            ]

        # glob() order depends on the file system; sort so that example keys
        # are reproducible across machines.
        raw_files = [raw_file for _dir in data_dirs for raw_file in sorted(_dir.glob("*.txt"))]

        if self.config.schema == "source":
            for i, raw_file in enumerate(raw_files):
                brat_example = parse_brat_file(raw_file, parse_notes=True)
                source_example = self._to_source_example(brat_example)
                yield i, source_example

        elif self.config.schema == "bigbio_kb":
            for i, raw_file in enumerate(raw_files):
                brat_example = parse_brat_file(raw_file, parse_notes=True)
                brat_to_bigbio_example = self._brat_to_bigbio_example(brat_example)
                kb_example = brat_parse_to_bigbio_kb(brat_to_bigbio_example)
                # The kb schema declares "id" as a string feature.
                kb_example["id"] = str(i)
                yield i, kb_example

    @staticmethod
    def _decode_note(annotation_id: str, annotation_type: str, ann_notes: Dict) -> Tuple:
        """Decode the annotator note attached to one text-bound annotation.

        The note text is a Python literal holding either
        ``(cui, preferred_term, semantic_type, semantic_group)`` or, without
        the CUI, ``(preferred_term, semantic_type, semantic_group)``; in the
        latter case the annotation's brat type is used as the CUI.

        Returns:
            ``(cui, preferred_term, semantic_type, semantic_group)``.

        Raises:
            ValueError: If the note does not refer to ``annotation_id``.
        """
        # Explicit check instead of `assert`, which is stripped under -O and
        # would let notes be silently paired with the wrong entity.
        if annotation_id != ann_notes["ref_id"]:
            raise ValueError(
                f"Annotator note {ann_notes['id']} refers to {ann_notes['ref_id']}, "
                f"expected {annotation_id}"
            )
        notes_values = ast.literal_eval(ann_notes["text"])
        if len(notes_values) == 4:
            cui, preferred_term, semantic_type, semantic_group = notes_values
        else:
            preferred_term, semantic_type, semantic_group = notes_values
            cui = annotation_type
        return cui, preferred_term, semantic_type, semantic_group

    def _to_source_example(self, brat_example: Dict) -> Dict:
        """Build a ``source`` schema example from a brat parse.

        Text-bound annotations and notes are paired positionally; surplus
        items on either side are ignored.
        """
        source_example = {
            "document_id": brat_example["document_id"],
            "text": brat_example["text"],
        }

        source_example["entities"] = []
        for entity_annotation, ann_notes in zip(brat_example["text_bound_annotations"], brat_example["notes"]):
            entity_ann = entity_annotation.copy()

            # Change id property name
            entity_ann["entity_id"] = entity_ann.pop("id")

            # Get values from annotator notes
            cui, preferred_term, semantic_type, semantic_group = self._decode_note(
                entity_ann["entity_id"], entity_ann["type"], ann_notes
            )
            entity_ann["cui"] = cui
            entity_ann["preferred_term"] = preferred_term
            entity_ann["semantic_type"] = semantic_type
            entity_ann["type"] = semantic_group
            entity_ann["normalized"] = [{"db_name": "UMLS", "db_id": cui}]

            # Add entity annotation to sample
            source_example["entities"].append(entity_ann)

        return source_example

    def _brat_to_bigbio_example(self, brat_example: Dict) -> Dict:
        """Rewrite a brat parse so it can be fed to ``brat_parse_to_bigbio_kb``.

        Entity types are replaced by the semantic group from the annotator
        notes and a UMLS normalization is synthesised for every entity.
        Text-bound annotations and notes are paired positionally; surplus
        items on either side are ignored.
        """
        kb_example = {
            "document_id": brat_example["document_id"],
            "text": brat_example["text"],
        }
        kb_example["text_bound_annotations"] = []
        kb_example["normalizations"] = []
        for entity_annotation, ann_notes in zip(brat_example["text_bound_annotations"], brat_example["notes"]):
            entity_ann = entity_annotation.copy()
            # Get values from annotator notes
            cui, _, _, semantic_group = self._decode_note(entity_ann["id"], entity_ann["type"], ann_notes)
            entity_ann["type"] = semantic_group
            kb_example["text_bound_annotations"].append(entity_ann)
            kb_example["normalizations"].append(
                {
                    "type": semantic_group,
                    "ref_id": entity_ann["id"],
                    "resource_name": "UMLS",
                    "cuid": cui,
                    "text": "",
                }
            )

        kb_example["events"] = brat_example["events"]
        kb_example["relations"] = brat_example["relations"]
        kb_example["equivalences"] = brat_example["equivalences"]
        kb_example["attributes"] = brat_example["attributes"]
        kb_example["notes"] = brat_example["notes"]

        return kb_example
+ +## Citation Information + +``` +@inproceedings{lima2023overview, + title={Overview of SympTEMIST at BioCreative VIII: corpus, guidelines and evaluation of systems for the detection and normalization of symptoms, signs and findings from text}, + author={Lima-L{\'o}pez, Salvador and Farr{\'e}-Maduell, Eul{\`a}lia and Gasco-S{\'a}nchez, Luis and Rodr{\'\i}guez-Miret, Jan and Krallinger, Martin}, + booktitle={Proceedings of the BioCreative VIII Challenge and Workshop: Curation and Evaluation in the era of Generative Models}, + year={2023} +} +``` diff --git a/bigbio/hub/hub_repos/symptemist/bigbiohub.py b/bigbio/hub/hub_repos/symptemist/bigbiohub.py new file mode 100644 index 00000000..f4da7bb7 --- /dev/null +++ b/bigbio/hub/hub_repos/symptemist/bigbiohub.py @@ -0,0 +1,590 @@ +from collections import defaultdict +from dataclasses import dataclass +from enum import Enum +import logging +from pathlib import Path +from types import SimpleNamespace +from typing import TYPE_CHECKING, Dict, Iterable, List, Tuple + +import datasets + +if TYPE_CHECKING: + import bioc + +logger = logging.getLogger(__name__) + + +BigBioValues = SimpleNamespace(NULL="") + + +@dataclass +class BigBioConfig(datasets.BuilderConfig): + """BuilderConfig for BigBio.""" + + name: str = None + version: datasets.Version = None + description: str = None + schema: str = None + subset_id: str = None + + +class Tasks(Enum): + NAMED_ENTITY_RECOGNITION = "NER" + NAMED_ENTITY_DISAMBIGUATION = "NED" + EVENT_EXTRACTION = "EE" + RELATION_EXTRACTION = "RE" + COREFERENCE_RESOLUTION = "COREF" + QUESTION_ANSWERING = "QA" + TEXTUAL_ENTAILMENT = "TE" + SEMANTIC_SIMILARITY = "STS" + TEXT_PAIRS_CLASSIFICATION = "TXT2CLASS" + PARAPHRASING = "PARA" + TRANSLATION = "TRANSL" + SUMMARIZATION = "SUM" + TEXT_CLASSIFICATION = "TXTCLASS" + + +entailment_features = datasets.Features( + { + "id": datasets.Value("string"), + "premise": datasets.Value("string"), + "hypothesis": datasets.Value("string"), + "label": 
datasets.Value("string"), + } +) + +pairs_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text_1": datasets.Value("string"), + "text_2": datasets.Value("string"), + "label": datasets.Value("string"), + } +) + +qa_features = datasets.Features( + { + "id": datasets.Value("string"), + "question_id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "question": datasets.Value("string"), + "type": datasets.Value("string"), + "choices": [datasets.Value("string")], + "context": datasets.Value("string"), + "answer": datasets.Sequence(datasets.Value("string")), + } +) + +text_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text": datasets.Value("string"), + "labels": [datasets.Value("string")], + } +) + +text2text_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text_1": datasets.Value("string"), + "text_2": datasets.Value("string"), + "text_1_name": datasets.Value("string"), + "text_2_name": datasets.Value("string"), + } +) + +kb_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "passages": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + } + ], + "entities": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + "normalized": [ + { + "db_name": datasets.Value("string"), + "db_id": datasets.Value("string"), + } + ], + } + ], + "events": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + # refers to the text_bound_annotation of the trigger + "trigger": { + "text": 
def get_texts_and_offsets_from_bioc_ann(ann: "bioc.BioCAnnotation") -> Tuple:
    """Return ``(offsets, texts)`` for a BioC annotation.

    ``offsets`` holds one ``(start, end)`` pair per entry in ``ann.locations``,
    where ``end`` is ``offset + length``. ``texts`` holds the annotation text
    cut into one chunk per location when there is more than one location;
    otherwise it is a single-element list containing ``ann.text`` unchanged.
    """
    spans = []
    for location in ann.locations:
        begin = location.offset
        spans.append((begin, begin + location.length))

    full_text = ann.text

    # Zero or one location: nothing to split.
    if len(spans) <= 1:
        return spans, [full_text]

    # Heuristic split for discontiguous annotations: consume as many characters
    # as each span is long, then skip the spaces separating the fragments.
    pieces = []
    cursor = 0
    for begin, stop in spans:
        width = stop - begin
        pieces.append(full_text[cursor : cursor + width])
        cursor += width
        while cursor < len(full_text) and full_text[cursor] == " ":
            cursor += 1

    return spans, pieces
datasets.Value("string"), + "ref_id": datasets.Value("string"), + "value": datasets.Value("string"), + } + ], + "normalizations": [ # N lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "resource_name": datasets.Value( + "string" + ), # Name of the resource, e.g. "Wikipedia" + "cuid": datasets.Value( + "string" + ), # ID in the resource, e.g. 534366 + "text": datasets.Value( + "string" + ), # Human readable description/name of the entity, e.g. "Barack Obama" + } + ], + ### OPTIONAL: Only included when `parse_notes == True` + "notes": [ # # lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "text": datasets.Value("string"), + } + ], + }, + ) + """ + + example = {} + example["document_id"] = txt_file.with_suffix("").name + with txt_file.open() as f: + example["text"] = f.read() + + # If no specific suffixes of the to-be-read annotation files are given - take standard suffixes + # for event extraction + if annotation_file_suffixes is None: + annotation_file_suffixes = [".a1", ".a2", ".ann"] + + if len(annotation_file_suffixes) == 0: + raise AssertionError( + "At least one suffix for the to-be-read annotation files should be given!" 
+ ) + + ann_lines = [] + for suffix in annotation_file_suffixes: + annotation_file = txt_file.with_suffix(suffix) + if annotation_file.exists(): + with annotation_file.open() as f: + ann_lines.extend(f.readlines()) + + example["text_bound_annotations"] = [] + example["events"] = [] + example["relations"] = [] + example["equivalences"] = [] + example["attributes"] = [] + example["normalizations"] = [] + + if parse_notes: + example["notes"] = [] + + for line in ann_lines: + line = line.strip() + if not line: + continue + + if line.startswith("T"): # Text bound + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["type"] = fields[1].split()[0] + ann["offsets"] = [] + span_str = remove_prefix(fields[1], (ann["type"] + " ")) + text = fields[2] + for span in span_str.split(";"): + start, end = span.split() + ann["offsets"].append([int(start), int(end)]) + + # Heuristically split text of discontiguous entities into chunks + ann["text"] = [] + if len(ann["offsets"]) > 1: + i = 0 + for start, end in ann["offsets"]: + chunk_len = end - start + ann["text"].append(text[i : chunk_len + i]) + i += chunk_len + while i < len(text) and text[i] == " ": + i += 1 + else: + ann["text"] = [text] + + example["text_bound_annotations"].append(ann) + + elif line.startswith("E"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + + ann["type"], ann["trigger"] = fields[1].split()[0].split(":") + + ann["arguments"] = [] + for role_ref_id in fields[1].split()[1:]: + argument = { + "role": (role_ref_id.split(":"))[0], + "ref_id": (role_ref_id.split(":"))[1], + } + ann["arguments"].append(argument) + + example["events"].append(ann) + + elif line.startswith("R"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["type"] = fields[1].split()[0] + + ann["head"] = { + "role": fields[1].split()[1].split(":")[0], + "ref_id": fields[1].split()[1].split(":")[1], + } + ann["tail"] = { + "role": fields[1].split()[2].split(":")[0], + "ref_id": 
def brat_parse_to_bigbio_kb(brat_parse: Dict) -> Dict:
    """
    Transform a brat parse (conforming to the standard brat schema) obtained with
    `parse_brat_file` into a dictionary conforming to the `bigbio-kb` schema (as defined in ../schemas/kb.py)

    The input is treated as read-only: nothing reachable from `brat_parse` is
    modified, so the same parse can be converted repeatedly with identical results.

    :param brat_parse: dict with the keys "document_id", "text",
        "text_bound_annotations", "events", "relations", "equivalences"
        and "normalizations".
    :return: dict with the keys "document_id", "passages", "events", "entities",
        "relations" and "coreferences". Every id is prefixed with the document id.
    :raises StopIteration: if an event names a trigger id that matches no
        text-bound annotation.
    """

    unified_example = {}

    # Prefix all ids with document id to ensure global uniqueness,
    # because brat ids are only unique within their document
    id_prefix = brat_parse["document_id"] + "_"

    # identical
    unified_example["document_id"] = brat_parse["document_id"]
    unified_example["passages"] = [
        {
            # id_prefix already ends with "_", so this id carries a double
            # underscore; left as is so that emitted ids stay stable.
            "id": id_prefix + "_text",
            "type": "abstract",
            "text": [brat_parse["text"]],
            "offsets": [[0, len(brat_parse["text"])]],
        }
    ]

    # get normalizations, grouped by the (unprefixed) id they refer to
    ref_id_to_normalizations = defaultdict(list)
    for normalization in brat_parse["normalizations"]:
        ref_id_to_normalizations[normalization["ref_id"]].append(
            {
                "db_name": normalization["resource_name"],
                "db_id": normalization["cuid"],
            }
        )

    # separate entities and event triggers
    unified_example["events"] = []
    non_event_ann = brat_parse["text_bound_annotations"].copy()
    for brat_event in brat_parse["events"]:
        event = brat_event.copy()
        event["id"] = id_prefix + event["id"]
        trigger = next(
            tr
            for tr in brat_parse["text_bound_annotations"]
            if tr["id"] == event["trigger"]
        )
        if trigger in non_event_ann:
            non_event_ann.remove(trigger)
        event["trigger"] = {
            "text": trigger["text"].copy(),
            "offsets": trigger["offsets"].copy(),
        }
        # `brat_event.copy()` is shallow, so the argument dicts are still the
        # caller's objects. Build new dicts instead of rewriting "ref_id" in
        # place, otherwise a second conversion would prefix the ids twice.
        event["arguments"] = [
            {**argument, "ref_id": id_prefix + argument["ref_id"]}
            for argument in brat_event["arguments"]
        ]

        unified_example["events"].append(event)

    unified_example["entities"] = []
    # A set gives constant-time membership tests in the loops below.
    anno_ids = {ann["id"] for ann in non_event_ann}
    for ann in non_event_ann:
        entity_ann = ann.copy()
        entity_ann["id"] = id_prefix + entity_ann["id"]
        entity_ann["normalized"] = ref_id_to_normalizations[ann["id"]]
        unified_example["entities"].append(entity_ann)

    # massage relations
    unified_example["relations"] = []
    skipped_relations = set()
    for ann in brat_parse["relations"]:
        if (
            ann["head"]["ref_id"] not in anno_ids
            or ann["tail"]["ref_id"] not in anno_ids
        ):
            skipped_relations.add(ann["id"])
            continue
        unified_example["relations"].append(
            {
                "arg1_id": id_prefix + ann["head"]["ref_id"],
                "arg2_id": id_prefix + ann["tail"]["ref_id"],
                "id": id_prefix + ann["id"],
                "type": ann["type"],
                "normalized": [],
            }
        )
    if len(skipped_relations) > 0:
        example_id = brat_parse["document_id"]
        logger.info(
            f"Example:{example_id}: The `bigbio_kb` schema allows `relations` only between entities."
            f" Skip (for now): "
            f"{list(skipped_relations)}"
        )

    # get coreferences
    unified_example["coreferences"] = []
    for cluster_number, ann in enumerate(brat_parse["equivalences"], start=1):
        # A cluster is kept only if every member is text-bound ("T…") and is
        # an entity rather than an event trigger.
        is_entity_cluster = all(
            ref_id.startswith("T") and ref_id in anno_ids
            for ref_id in ann["ref_ids"]
        )
        if is_entity_cluster:
            entity_ids = [id_prefix + ref_id for ref_id in ann["ref_ids"]]
            unified_example["coreferences"].append(
                {"id": id_prefix + str(cluster_number), "entity_ids": entity_ids}
            )
    return unified_example
from pathlib import Path
from typing import Dict, Iterator, List, Tuple

import datasets
import pandas as pd

from .bigbiohub import kb_features
from .bigbiohub import BigBioConfig
from .bigbiohub import Tasks

_LANGUAGES = ["Spanish"]
_PUBMED = False
_LOCAL = False
# Backslashes are doubled so that the BibTeX accent macros (\', \` and \i)
# survive Python's string-escape processing; with single backslashes "\'"
# collapses to "'" and the author names are corrupted at runtime.
_CITATION = """\
@inproceedings{lima2023overview,
  title={Overview of SympTEMIST at BioCreative VIII: corpus, guidelines and evaluation of systems for the detection and normalization of symptoms, signs and findings from text},
  author={Lima-L{\\'o}pez, Salvador and Farr{\\'e}-Maduell, Eul{\\`a}lia and Gasco-S{\\'a}nchez, Luis and Rodr{\\'\\i}guez-Miret, Jan and Krallinger, Martin},
  booktitle={Proceedings of the BioCreative VIII Challenge and Workshop: Curation and Evaluation in the era of Generative Models},
  year={2023}
}
"""

_DATASETNAME = "symptemist"
_DISPLAYNAME = "SympTEMIST"

# NOTE(review): the second sentence says the texts are "the same ones used in
# SympTEMIST and MedProcNER", which is self-referential for this corpus;
# presumably a sibling corpus is meant -- confirm against the homepage.
_DESCRIPTION = """\
The SympTEMIST corpus is a collection of 1,000 clinical case reports in Spanish annotated with symptoms, signs and findings mentions and normalized to SNOMED CT. The texts belong to the SPACCC corpus and are the same ones used in SympTEMIST and MedProcNER, making the annotations complementary for medical entity recognition.
"""

_HOMEPAGE = "https://temu.bsc.es/symptemist/"

_LICENSE = "CC_BY_4p0"

_URLS = {
    _DATASETNAME: "https://zenodo.org/records/10635215/files/symptemist-complete_240208.zip?download=1",
}

_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, Tasks.NAMED_ENTITY_DISAMBIGUATION]

_SOURCE_VERSION = "4.0.0"
_BIGBIO_VERSION = "1.0.0"


def _split_plus_separated(value) -> List[str]:
    """Split a ``+``-joined TSV cell into its parts; a missing cell gives ``[]``."""
    if pd.isna(value):
        return []
    # str() guards against pandas having parsed an all-numeric column as numbers.
    return str(value).split("+")


class SymptemistDataset(datasets.GeneratorBasedBuilder):
    """
    The SympTEMIST corpus is a collection of 1,000 clinical case reports in Spanish annotated with symptoms, signs and findings mentions and normalized to SNOMED CT.

    Four subsets are exposed (entities, linking, linking_complete,
    linking_composite), each in a `source` and a `bigbio_kb` schema.
    """

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION)

    BUILDER_CONFIGS = [
        BigBioConfig(
            name="symptemist_entities_source",
            version=SOURCE_VERSION,
            description="SympTEMIST (subtrack 1: entities) source schema",
            schema="source",
            subset_id="symptemist_entities",
        ),
        BigBioConfig(
            name="symptemist_linking_source",
            version=SOURCE_VERSION,
            description="SympTEMIST (subtrack 2: linking, original shared task) source schema",
            schema="source",
            subset_id="symptemist_linking",
        ),
        BigBioConfig(
            name="symptemist_linking_complete_source",
            version=SOURCE_VERSION,
            description="SympTEMIST (subtrack 2: linking, complete) source schema",
            schema="source",
            subset_id="symptemist_linking_complete",
        ),
        BigBioConfig(
            name="symptemist_linking_composite_source",
            version=SOURCE_VERSION,
            description="SympTEMIST (subtrack 2: linking, incl. composite mentions) source schema",
            schema="source",
            subset_id="symptemist_linking_composite",
        ),
        BigBioConfig(
            name="symptemist_entities_bigbio_kb",
            version=BIGBIO_VERSION,
            description="SympTEMIST (subtrack 1: entities) BigBio schema",
            schema="bigbio_kb",
            subset_id="symptemist_entities",
        ),
        BigBioConfig(
            name="symptemist_linking_bigbio_kb",
            version=BIGBIO_VERSION,
            description="SympTEMIST (subtrack 2: linking, original shared task) BigBio schema",
            schema="bigbio_kb",
            subset_id="symptemist_linking",
        ),
        BigBioConfig(
            name="symptemist_linking_complete_bigbio_kb",
            version=BIGBIO_VERSION,
            description="SympTEMIST (subtrack 2: linking, complete) BigBio schema",
            schema="bigbio_kb",
            subset_id="symptemist_linking_complete",
        ),
        BigBioConfig(
            name="symptemist_linking_composite_bigbio_kb",
            version=BIGBIO_VERSION,
            description="SympTEMIST (subtrack 2: linking, incl. composite mentions) BigBio schema",
            schema="bigbio_kb",
            subset_id="symptemist_linking_composite",
        ),
    ]

    DEFAULT_CONFIG_NAME = "symptemist_entities_source"

    def _info(self) -> datasets.DatasetInfo:
        """Describe the features of the configured schema.

        Raises:
            ValueError: if the configured schema is neither "source" nor "bigbio_kb".
        """
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "document_id": datasets.Value("string"),
                    "passages": [
                        {
                            "id": datasets.Value("string"),
                            "type": datasets.Value("string"),
                            "text": datasets.Sequence(datasets.Value("string")),
                            "offsets": datasets.Sequence([datasets.Value("int32")]),
                        }
                    ],
                    "entities": [
                        {
                            "id": datasets.Value("string"),
                            "type": datasets.Value("string"),
                            "text": datasets.Sequence(datasets.Value("string")),
                            "offsets": datasets.Sequence([datasets.Value("int32")]),
                            "concept_codes": datasets.Sequence(datasets.Value("string")),
                            "semantic_relations": datasets.Sequence(datasets.Value("string")),
                        }
                    ],
                }
            )
        elif self.config.schema == "bigbio_kb":
            features = kb_features
        else:
            # Fail with a clear message instead of an unbound-local error below.
            raise ValueError(f"Unsupported schema: {self.config.schema!r}")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=str(_LICENSE),
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
        """Returns SplitGenerators."""
        urls = _URLS[_DATASETNAME]
        data_dir = dl_manager.download_and_extract(urls)
        base_bath = Path(data_dir) / "symptemist-complete_240208"

        # The gen_kwargs keys must match the parameter names of
        # `_generate_examples` (including the historical spelling `base_bath`).
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "split": "train",
                    "track": self.config.subset_id,
                    "base_bath": base_bath,
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "split": "test",
                    "track": self.config.subset_id,
                    "base_bath": base_bath,
                },
            ),
        ]

    def _generate_examples(
        self,
        split: str,
        track: str,
        base_bath: Path,
    ) -> Iterator[Tuple[int, Dict]]:
        """Yields examples as (key, example) tuples.

        Args:
            split: "train" or "test".
            track: subset id of the active config (e.g. "symptemist_linking").
            base_bath: root directory of the extracted archive.
        """
        train_dir = base_bath / "symptemist_train"
        test_dir = base_bath / "symptemist_test"

        tsv_files = {
            ("symptemist_entities", "train"): [
                train_dir / "subtask1-ner" / "tsv" / "symptemist_tsv_train_subtask1.tsv"
            ],
            ("symptemist_entities", "test"): [
                test_dir / "subtask1-ner" / "tsv" / "symptemist_tsv_test_subtask1.tsv"
            ],
            ("symptemist_linking", "train"): [
                train_dir / "subtask2-linking" / "symptemist_tsv_train_subtask2.tsv"
            ],
            ("symptemist_linking", "test"): [
                test_dir / "subtask2-linking" / "symptemist_tsv_test_subtask2.tsv"
            ],
            ("symptemist_linking_complete", "train"): [
                train_dir / "subtask2-linking" / "symptemist_tsv_train_subtask2_complete.tsv"
            ],
            ("symptemist_linking_complete", "test"): [
                test_dir / "subtask2-linking" / "symptemist_tsv_test_subtask2.tsv"
            ],
            ("symptemist_linking_composite", "train"): [
                train_dir / "subtask2-linking" / "symptemist_tsv_train_subtask2_complete+COMPOSITE.tsv"
            ],
            ("symptemist_linking_composite", "test"): [
                test_dir / "subtask2-linking" / "symptemist_tsv_test_subtask2+COMPOSITE.tsv"
            ],
        }

        entity_mapping_files = tsv_files[(track, split)]
        text_files_dir = base_bath / f"symptemist_{split}" / "subtask1-ner" / "txt"

        # keep this in case more files are added later
        entities_mapping = pd.concat([pd.read_csv(file, sep="\t") for file in entity_mapping_files])

        is_source = self.config.schema == "source"
        is_kb = self.config.schema == "bigbio_kb"
        is_entities_track = self.config.subset_id == "symptemist_entities"
        is_linking_track = self.config.subset_id.startswith("symptemist_linking")

        # One pass over the table; sort=False keeps documents in order of first
        # appearance, and rows inside a group keep their original order and index.
        documents = entities_mapping.groupby("filename", sort=False)

        for uid, (filename, entities_df) in enumerate(documents):
            text_file = text_files_dir / f"{filename}.txt"
            doc_text = text_file.read_text(encoding="utf8")

            example = {
                "id": f"{uid}",
                "document_id": filename,
                "passages": [
                    {
                        "id": f"{uid}_{filename}_passage",
                        "type": "clinical_case",
                        "text": [doc_text],
                        "offsets": [[0, len(doc_text)]],
                    }
                ],
            }
            if is_kb:
                example["events"] = []
                example["coreferences"] = []
                example["relations"] = []

            entities = []
            for row in entities_df.itertuples(name="Entity"):
                # The entity TSV and the linking TSVs name their span columns differently.
                if is_entities_track:
                    offsets = [[row.start_span, row.end_span]]
                else:
                    offsets = [[row.span_ini, row.span_end]]

                entity = {
                    "id": f"{uid}_{row.filename}_{row.Index}_entity_id",
                    "type": row.label,
                    "text": [row.text],
                    "offsets": offsets,
                }

                if is_source:
                    if is_linking_track:
                        # Same rule as the bigbio_kb branch: every linking
                        # subset carries codes, not only "symptemist_linking".
                        entity["concept_codes"] = _split_plus_separated(row.code)
                        # NOTE(review): `sem_rel` is assumed to possibly be absent
                        # from the complete/composite files -- confirm; a missing
                        # column yields an empty list.
                        entity["semantic_relations"] = _split_plus_separated(getattr(row, "sem_rel", None))
                    else:
                        entity["concept_codes"] = []
                        entity["semantic_relations"] = []

                elif is_kb:
                    if is_linking_track:
                        entity["normalized"] = [
                            {"db_id": code, "db_name": "SNOMED_CT"} for code in _split_plus_separated(row.code)
                        ]
                    else:
                        entity["normalized"] = []

                entities.append(entity)

            example["entities"] = entities
            yield uid, example