From c30f83de237ab37631e38bc9e181067295e90979 Mon Sep 17 00:00:00 2001 From: Jeffrey Zhong <20jeffreyzhong02@gmail.com> Date: Thu, 14 Mar 2024 12:42:55 -0700 Subject: [PATCH 1/7] inital working draft of flambe --- bigbio/hub/hub_repos/flambe/README.md | 40 ++ bigbio/hub/hub_repos/flambe/__init__.py | 0 bigbio/hub/hub_repos/flambe/bigbiohub.py | 590 +++++++++++++++++++++++ bigbio/hub/hub_repos/flambe/flambe.py | 317 ++++++++++++ 4 files changed, 947 insertions(+) create mode 100644 bigbio/hub/hub_repos/flambe/README.md create mode 100644 bigbio/hub/hub_repos/flambe/__init__.py create mode 100644 bigbio/hub/hub_repos/flambe/bigbiohub.py create mode 100644 bigbio/hub/hub_repos/flambe/flambe.py diff --git a/bigbio/hub/hub_repos/flambe/README.md b/bigbio/hub/hub_repos/flambe/README.md new file mode 100644 index 000000000..5fec000e9 --- /dev/null +++ b/bigbio/hub/hub_repos/flambe/README.md @@ -0,0 +1,40 @@ +--- +language: + - en [This needs to be a supported huggingface language code] +bigbio_language: + - English +license: apache-2.0 [this shoudl be a supported huggingface license] +bigbio_license_shortname: APACHE_2p0 +multilinguality: monolingual +pretty_name: SciTail +homepage: https://allenai.org/data/scitail +bigbio_pubmed: false +bigbio_public: true +bigbio_tasks: + - TEXTUAL_ENTAILMENT +paperswithcode_id: scitail +--- + + +# Dataset Card for SciTail + +## Dataset Description + +- **Homepage:** https://allenai.org/data/scitail +- **Pubmed:** False +- **Public:** True +- **Tasks:** TE [This needs to be a comma delimitted string of task short names] + + +[This can be equal to the `_DESCRIPTION` attribute of the dataset you are implementing] The SciTail dataset is an entailment dataset created from multiple-choice science exams and web sentences. Each question and the correct answer choice are converted into an assertive statement to form the hypothesis. We use information retrieval to obtain relevant text from a large text corpus of web sentences, and use these sentences as a premise P. We crowd source the annotation of such premise-hypothesis pair as supports (entails) or not (neutral), in order to create the SciTail dataset. The dataset contains 27,026 examples with 10,101 examples with entails label and 16,925 examples with neutral label. 
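A minimal loading sketch for this dataset card (an editorial aside, not part of the committed patch): it assumes the `fulltext_tools` source config declared in `flambe.py` later in this same patch, and loads directly from the local loader script rather than from a published hub path.

```python
import datasets

# Load the "fulltext_tools" source config straight from the loader script added in
# this patch. Newer versions of `datasets` may additionally require
# trust_remote_code=True for script-based datasets.
ds = datasets.load_dataset("flambe.py", name="fulltext_tools")

# Each example follows the source NER schema defined in _info():
# {"id": ..., "tokens": [...], "tags": [...]}
print(ds["train"][0])
```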
+ + +## Citation Information + +``` +@inproceedings{scitail, + author = {Tushar Khot and Ashish Sabharwal and Peter Clark}, + booktitle = {AAAI} + title = {SciTail: A Textual Entailment Dataset from Science Question Answering}, + year = {2018} +``` diff --git a/bigbio/hub/hub_repos/flambe/__init__.py b/bigbio/hub/hub_repos/flambe/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/bigbio/hub/hub_repos/flambe/bigbiohub.py b/bigbio/hub/hub_repos/flambe/bigbiohub.py new file mode 100644 index 000000000..f4da7bb78 --- /dev/null +++ b/bigbio/hub/hub_repos/flambe/bigbiohub.py @@ -0,0 +1,590 @@ +from collections import defaultdict +from dataclasses import dataclass +from enum import Enum +import logging +from pathlib import Path +from types import SimpleNamespace +from typing import TYPE_CHECKING, Dict, Iterable, List, Tuple + +import datasets + +if TYPE_CHECKING: + import bioc + +logger = logging.getLogger(__name__) + + +BigBioValues = SimpleNamespace(NULL="") + + +@dataclass +class BigBioConfig(datasets.BuilderConfig): + """BuilderConfig for BigBio.""" + + name: str = None + version: datasets.Version = None + description: str = None + schema: str = None + subset_id: str = None + + +class Tasks(Enum): + NAMED_ENTITY_RECOGNITION = "NER" + NAMED_ENTITY_DISAMBIGUATION = "NED" + EVENT_EXTRACTION = "EE" + RELATION_EXTRACTION = "RE" + COREFERENCE_RESOLUTION = "COREF" + QUESTION_ANSWERING = "QA" + TEXTUAL_ENTAILMENT = "TE" + SEMANTIC_SIMILARITY = "STS" + TEXT_PAIRS_CLASSIFICATION = "TXT2CLASS" + PARAPHRASING = "PARA" + TRANSLATION = "TRANSL" + SUMMARIZATION = "SUM" + TEXT_CLASSIFICATION = "TXTCLASS" + + +entailment_features = datasets.Features( + { + "id": datasets.Value("string"), + "premise": datasets.Value("string"), + "hypothesis": datasets.Value("string"), + "label": datasets.Value("string"), + } +) + +pairs_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text_1": datasets.Value("string"), + "text_2": datasets.Value("string"), + "label": datasets.Value("string"), + } +) + +qa_features = datasets.Features( + { + "id": datasets.Value("string"), + "question_id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "question": datasets.Value("string"), + "type": datasets.Value("string"), + "choices": [datasets.Value("string")], + "context": datasets.Value("string"), + "answer": datasets.Sequence(datasets.Value("string")), + } +) + +text_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text": datasets.Value("string"), + "labels": [datasets.Value("string")], + } +) + +text2text_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text_1": datasets.Value("string"), + "text_2": datasets.Value("string"), + "text_1_name": datasets.Value("string"), + "text_2_name": datasets.Value("string"), + } +) + +kb_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "passages": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + } + ], + "entities": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + "normalized": [ + { + "db_name": datasets.Value("string"), 
+ "db_id": datasets.Value("string"), + } + ], + } + ], + "events": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + # refers to the text_bound_annotation of the trigger + "trigger": { + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + }, + "arguments": [ + { + "role": datasets.Value("string"), + "ref_id": datasets.Value("string"), + } + ], + } + ], + "coreferences": [ + { + "id": datasets.Value("string"), + "entity_ids": datasets.Sequence(datasets.Value("string")), + } + ], + "relations": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "arg1_id": datasets.Value("string"), + "arg2_id": datasets.Value("string"), + "normalized": [ + { + "db_name": datasets.Value("string"), + "db_id": datasets.Value("string"), + } + ], + } + ], + } +) + + +TASK_TO_SCHEMA = { + Tasks.NAMED_ENTITY_RECOGNITION.name: "KB", + Tasks.NAMED_ENTITY_DISAMBIGUATION.name: "KB", + Tasks.EVENT_EXTRACTION.name: "KB", + Tasks.RELATION_EXTRACTION.name: "KB", + Tasks.COREFERENCE_RESOLUTION.name: "KB", + Tasks.QUESTION_ANSWERING.name: "QA", + Tasks.TEXTUAL_ENTAILMENT.name: "TE", + Tasks.SEMANTIC_SIMILARITY.name: "PAIRS", + Tasks.TEXT_PAIRS_CLASSIFICATION.name: "PAIRS", + Tasks.PARAPHRASING.name: "T2T", + Tasks.TRANSLATION.name: "T2T", + Tasks.SUMMARIZATION.name: "T2T", + Tasks.TEXT_CLASSIFICATION.name: "TEXT", +} + +SCHEMA_TO_TASKS = defaultdict(set) +for task, schema in TASK_TO_SCHEMA.items(): + SCHEMA_TO_TASKS[schema].add(task) +SCHEMA_TO_TASKS = dict(SCHEMA_TO_TASKS) + +VALID_TASKS = set(TASK_TO_SCHEMA.keys()) +VALID_SCHEMAS = set(TASK_TO_SCHEMA.values()) + +SCHEMA_TO_FEATURES = { + "KB": kb_features, + "QA": qa_features, + "TE": entailment_features, + "T2T": text2text_features, + "TEXT": text_features, + "PAIRS": pairs_features, +} + + +def get_texts_and_offsets_from_bioc_ann(ann: "bioc.BioCAnnotation") -> Tuple: + + offsets = [(loc.offset, loc.offset + loc.length) for loc in ann.locations] + + text = ann.text + + if len(offsets) > 1: + i = 0 + texts = [] + for start, end in offsets: + chunk_len = end - start + texts.append(text[i : chunk_len + i]) + i += chunk_len + while i < len(text) and text[i] == " ": + i += 1 + else: + texts = [text] + + return offsets, texts + + +def remove_prefix(a: str, prefix: str) -> str: + if a.startswith(prefix): + a = a[len(prefix) :] + return a + + +def parse_brat_file( + txt_file: Path, + annotation_file_suffixes: List[str] = None, + parse_notes: bool = False, +) -> Dict: + """ + Parse a brat file into the schema defined below. + `txt_file` should be the path to the brat '.txt' file you want to parse, e.g. 'data/1234.txt' + Assumes that the annotations are contained in one or more of the corresponding '.a1', '.a2' or '.ann' files, + e.g. 'data/1234.ann' or 'data/1234.a1' and 'data/1234.a2'. + Will include annotator notes, when `parse_notes == True`. + brat_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text": datasets.Value("string"), + "text_bound_annotations": [ # T line in brat, e.g. 
type or event trigger + { + "offsets": datasets.Sequence([datasets.Value("int32")]), + "text": datasets.Sequence(datasets.Value("string")), + "type": datasets.Value("string"), + "id": datasets.Value("string"), + } + ], + "events": [ # E line in brat + { + "trigger": datasets.Value( + "string" + ), # refers to the text_bound_annotation of the trigger, + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "arguments": datasets.Sequence( + { + "role": datasets.Value("string"), + "ref_id": datasets.Value("string"), + } + ), + } + ], + "relations": [ # R line in brat + { + "id": datasets.Value("string"), + "head": { + "ref_id": datasets.Value("string"), + "role": datasets.Value("string"), + }, + "tail": { + "ref_id": datasets.Value("string"), + "role": datasets.Value("string"), + }, + "type": datasets.Value("string"), + } + ], + "equivalences": [ # Equiv line in brat + { + "id": datasets.Value("string"), + "ref_ids": datasets.Sequence(datasets.Value("string")), + } + ], + "attributes": [ # M or A lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "value": datasets.Value("string"), + } + ], + "normalizations": [ # N lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "resource_name": datasets.Value( + "string" + ), # Name of the resource, e.g. "Wikipedia" + "cuid": datasets.Value( + "string" + ), # ID in the resource, e.g. 534366 + "text": datasets.Value( + "string" + ), # Human readable description/name of the entity, e.g. "Barack Obama" + } + ], + ### OPTIONAL: Only included when `parse_notes == True` + "notes": [ # # lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "text": datasets.Value("string"), + } + ], + }, + ) + """ + + example = {} + example["document_id"] = txt_file.with_suffix("").name + with txt_file.open() as f: + example["text"] = f.read() + + # If no specific suffixes of the to-be-read annotation files are given - take standard suffixes + # for event extraction + if annotation_file_suffixes is None: + annotation_file_suffixes = [".a1", ".a2", ".ann"] + + if len(annotation_file_suffixes) == 0: + raise AssertionError( + "At least one suffix for the to-be-read annotation files should be given!" 
+ ) + + ann_lines = [] + for suffix in annotation_file_suffixes: + annotation_file = txt_file.with_suffix(suffix) + if annotation_file.exists(): + with annotation_file.open() as f: + ann_lines.extend(f.readlines()) + + example["text_bound_annotations"] = [] + example["events"] = [] + example["relations"] = [] + example["equivalences"] = [] + example["attributes"] = [] + example["normalizations"] = [] + + if parse_notes: + example["notes"] = [] + + for line in ann_lines: + line = line.strip() + if not line: + continue + + if line.startswith("T"): # Text bound + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["type"] = fields[1].split()[0] + ann["offsets"] = [] + span_str = remove_prefix(fields[1], (ann["type"] + " ")) + text = fields[2] + for span in span_str.split(";"): + start, end = span.split() + ann["offsets"].append([int(start), int(end)]) + + # Heuristically split text of discontiguous entities into chunks + ann["text"] = [] + if len(ann["offsets"]) > 1: + i = 0 + for start, end in ann["offsets"]: + chunk_len = end - start + ann["text"].append(text[i : chunk_len + i]) + i += chunk_len + while i < len(text) and text[i] == " ": + i += 1 + else: + ann["text"] = [text] + + example["text_bound_annotations"].append(ann) + + elif line.startswith("E"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + + ann["type"], ann["trigger"] = fields[1].split()[0].split(":") + + ann["arguments"] = [] + for role_ref_id in fields[1].split()[1:]: + argument = { + "role": (role_ref_id.split(":"))[0], + "ref_id": (role_ref_id.split(":"))[1], + } + ann["arguments"].append(argument) + + example["events"].append(ann) + + elif line.startswith("R"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["type"] = fields[1].split()[0] + + ann["head"] = { + "role": fields[1].split()[1].split(":")[0], + "ref_id": fields[1].split()[1].split(":")[1], + } + ann["tail"] = { + "role": fields[1].split()[2].split(":")[0], + "ref_id": fields[1].split()[2].split(":")[1], + } + + example["relations"].append(ann) + + # '*' seems to be the legacy way to mark equivalences, + # but I couldn't find any info on the current way + # this might have to be adapted dependent on the brat version + # of the annotation + elif line.startswith("*"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["ref_ids"] = fields[1].split()[1:] + + example["equivalences"].append(ann) + + elif line.startswith("A") or line.startswith("M"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + + info = fields[1].split() + ann["type"] = info[0] + ann["ref_id"] = info[1] + + if len(info) > 2: + ann["value"] = info[2] + else: + ann["value"] = "" + + example["attributes"].append(ann) + + elif line.startswith("N"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["text"] = fields[2] + + info = fields[1].split() + + ann["type"] = info[0] + ann["ref_id"] = info[1] + ann["resource_name"] = info[2].split(":")[0] + ann["cuid"] = info[2].split(":")[1] + example["normalizations"].append(ann) + + elif parse_notes and line.startswith("#"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["text"] = fields[2] if len(fields) == 3 else BigBioValues.NULL + + info = fields[1].split() + + ann["type"] = info[0] + ann["ref_id"] = info[1] + example["notes"].append(ann) + + return example + + +def brat_parse_to_bigbio_kb(brat_parse: Dict) -> Dict: + """ + Transform a brat parse (conforming to the standard brat schema) obtained with + 
`parse_brat_file` into a dictionary conforming to the `bigbio-kb` schema (as defined in ../schemas/kb.py) + :param brat_parse: + """ + + unified_example = {} + + # Prefix all ids with document id to ensure global uniqueness, + # because brat ids are only unique within their document + id_prefix = brat_parse["document_id"] + "_" + + # identical + unified_example["document_id"] = brat_parse["document_id"] + unified_example["passages"] = [ + { + "id": id_prefix + "_text", + "type": "abstract", + "text": [brat_parse["text"]], + "offsets": [[0, len(brat_parse["text"])]], + } + ] + + # get normalizations + ref_id_to_normalizations = defaultdict(list) + for normalization in brat_parse["normalizations"]: + ref_id_to_normalizations[normalization["ref_id"]].append( + { + "db_name": normalization["resource_name"], + "db_id": normalization["cuid"], + } + ) + + # separate entities and event triggers + unified_example["events"] = [] + non_event_ann = brat_parse["text_bound_annotations"].copy() + for event in brat_parse["events"]: + event = event.copy() + event["id"] = id_prefix + event["id"] + trigger = next( + tr + for tr in brat_parse["text_bound_annotations"] + if tr["id"] == event["trigger"] + ) + if trigger in non_event_ann: + non_event_ann.remove(trigger) + event["trigger"] = { + "text": trigger["text"].copy(), + "offsets": trigger["offsets"].copy(), + } + for argument in event["arguments"]: + argument["ref_id"] = id_prefix + argument["ref_id"] + + unified_example["events"].append(event) + + unified_example["entities"] = [] + anno_ids = [ref_id["id"] for ref_id in non_event_ann] + for ann in non_event_ann: + entity_ann = ann.copy() + entity_ann["id"] = id_prefix + entity_ann["id"] + entity_ann["normalized"] = ref_id_to_normalizations[ann["id"]] + unified_example["entities"].append(entity_ann) + + # massage relations + unified_example["relations"] = [] + skipped_relations = set() + for ann in brat_parse["relations"]: + if ( + ann["head"]["ref_id"] not in anno_ids + or ann["tail"]["ref_id"] not in anno_ids + ): + skipped_relations.add(ann["id"]) + continue + unified_example["relations"].append( + { + "arg1_id": id_prefix + ann["head"]["ref_id"], + "arg2_id": id_prefix + ann["tail"]["ref_id"], + "id": id_prefix + ann["id"], + "type": ann["type"], + "normalized": [], + } + ) + if len(skipped_relations) > 0: + example_id = brat_parse["document_id"] + logger.info( + f"Example:{example_id}: The `bigbio_kb` schema allows `relations` only between entities." + f" Skip (for now): " + f"{list(skipped_relations)}" + ) + + # get coreferences + unified_example["coreferences"] = [] + for i, ann in enumerate(brat_parse["equivalences"], start=1): + is_entity_cluster = True + for ref_id in ann["ref_ids"]: + if not ref_id.startswith("T"): # not textbound -> no entity + is_entity_cluster = False + elif ref_id not in anno_ids: # event trigger -> no entity + is_entity_cluster = False + if is_entity_cluster: + entity_ids = [id_prefix + i for i in ann["ref_ids"]] + unified_example["coreferences"].append( + {"id": id_prefix + str(i), "entity_ids": entity_ids} + ) + return unified_example diff --git a/bigbio/hub/hub_repos/flambe/flambe.py b/bigbio/hub/hub_repos/flambe/flambe.py new file mode 100644 index 000000000..d8aa7ef62 --- /dev/null +++ b/bigbio/hub/hub_repos/flambe/flambe.py @@ -0,0 +1,317 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This template serves as a starting point for contributing a dataset to the BigScience Biomedical repo. + +When modifying it for your dataset, look for TODO items that offer specific instructions. + +Full documentation on writing dataset loading scripts can be found here: +https://huggingface.co/docs/datasets/add_dataset.html + +To create a dataset loading script you will create a class and implement 3 methods: + * `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object. + * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associate local data with each split. + * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`. + +TODO: Before submitting your script, delete this doc string and replace it with a description of your dataset. + +[bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment) +""" + +import os +from typing import List, Tuple, Dict +import re + +import datasets +from .bigbiohub import BigBioConfig +from .bigbiohub import Tasks + +# TODO: import the schema that fits your dataset: +from .bigbiohub import kb_features + +# TODO: add True or False boolean value indicating if this dataset is local or not +_LOCAL = False + +# TODO: Add BibTeX citation +_CITATION = """\ +@article{, + author = {}, + title = {}, + journal = {}, + volume = {}, + year = {}, + url = {}, + doi = {}, + biburl = {}, + bibsource = {} +} +""" + +# TODO: create a module level variable with your dataset name (should match script name) +# E.g. Hallmarks of Cancer: flambe --> hallmarks_of_cancer +_DATASETNAME = "flambe" + +# TODO: Add description of the dataset here +# You can copy an official description +_DESCRIPTION = """\ +This dataset is designed for XXX NLP task. +""" + +# TODO: Add a link to an official homepage for the dataset here (if possible) +_HOMEPAGE = "https://github.com/ylaboratory/flambe" + +# TODO: Add the licence for the dataset here (if possible) +# Note that this doesn't have to be a common open source license. +# Some datasets have custom licenses. In this case, simply put the full license terms +# into `_LICENSE` +_LICENSE = "" + +# TODO: Add links to the urls needed to download your dataset files. +# For local datasets, this variable can be an empty dictionary. + +# For publicly available datasets you will most likely end up passing these URLs to dl_manager in _split_generators. +# In most cases the URLs will be the same for the source and bigbio config. +# However, if you need to access different files for each config you can have multiple entries in this dict. +# This can be an arbitrarily nested dict/list of URLs (see below in `_split_generators` method) +_URLS = { + _DATASETNAME: "https://zenodo.org/records/10050681/files/data.zip?download ", +} + +# TODO: add supported task by dataset. 
One dataset may support multiple tasks +_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] + +# TODO: set this to a version that is associated with the dataset. if none exists use "1.0.0" +# This version doesn't have to be consistent with semantic versioning. Anything that is +# provided by the original dataset as a version goes. +_SOURCE_VERSION = "1.0.0" + +_BIGBIO_VERSION = "1.0.0" + + +# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case +# Append "Dataset" to the class name: BioASQ --> BioasqDataset +class NewDataset(datasets.GeneratorBasedBuilder): + """TODO: Short description of my dataset.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + # You will be able to load the "source" or "bigbio" configurations with + # ds_source = datasets.load_dataset('my_dataset', name='source') + # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio') + + # For local datasets you can make use of the `data_dir` and `data_files` kwargs + # https://huggingface.co/docs/datasets/add_dataset.html#downloading-data-files-and-organizing-splits + # ds_source = datasets.load_dataset('my_dataset', name='source', data_dir="/path/to/data/files") + # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio', data_dir="/path/to/data/files") + + # TODO: For each dataset, implement Config for Source and BigBio; + # If dataset contains more than one subset (see examples/bioasq.py) implement for EACH of them. + # Each of them should contain: + # - name: should be unique for each dataset config eg. bioasq10b_(source|bigbio)_[bigbio_schema_name] + # - version: option = (SOURCE_VERSION|BIGBIO_VERSION) + # - description: one line description for the dataset + # - schema: options = (source|bigbio_[bigbio_schema_name]) + # - subset_id: subset id is the canonical name for the dataset (eg. bioasq10b) + # where [bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment) + + BUILDER_CONFIGS = [ + BigBioConfig( + name="fulltext_tools", + version=SOURCE_VERSION, + description="...", + schema="source", + subset_id="fulltext_tools", + ), + BigBioConfig( + name="fulltext_tissues", + version=SOURCE_VERSION, + description="fulltext_tissues", + schema="source", + subset_id="fulltext_tissues", + ), + BigBioConfig( + name="abstract_tissues", + version=SOURCE_VERSION, + description="fulltext_tissues", + schema="source", + subset_id="abstract_tissues", + ), + ] + + DEFAULT_CONFIG_NAME = "fulltext_tools" + + def _info(self) -> datasets.DatasetInfo: + + # Create the source schema; this schema will keep all keys/information/labels as close to the original dataset as possible. + + # You can arbitrarily nest lists and dictionaries. + # For iterables, use lists over tuples or `datasets.Sequence` + + if self.config.schema == "source": + # TODO: Create your source schema here + #raise NotImplementedError() + + # EX: Arbitrary NER type dataset + features = datasets.Features( + { + "id": datasets.Value("string"), + "tokens": datasets.Sequence(datasets.Value("string")), + "tags": datasets.Sequence(datasets.Value("string")), + } + ) + + # Choose the appropriate bigbio schema for your task and copy it here. You can find information on the schemas in the CONTRIBUTING guide. + + # In rare cases you may get a dataset that supports multiple tasks requiring multiple schemas. 
In that case you can define multiple bigbio configs with a bigbio_[bigbio_schema_name] format. + + # For example bigbio_kb, bigbio_t2t + elif self.config.schema == "bigbio_kb": + features = kb_features + # TODO: Choose your big-bio schema here + #raise NotImplementedError() + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration + + # If you need to access the "source" or "bigbio" config choice, that will be in self.config.name + + # LOCAL DATASETS: You do not need the dl_manager; you can ignore this argument. Make sure `gen_kwargs` in the return gets passed the right filepath + + # PUBLIC DATASETS: Assign your data-dir based on the dl_manager. + + # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs; many examples use the download_and_extract method; see the DownloadManager docs here: https://huggingface.co/docs/datasets/package_reference/builder_classes.html#datasets.DownloadManager + + # dl_manager can accept any type of nested list/dict and will give back the same structure with the url replaced with the path to local files. + + # TODO: KEEP if your dataset is PUBLIC; remove if not + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + # TODO: KEEP if your dataset is LOCAL; remove if NOT + # if self.config.data_dir is None: + # raise ValueError("This is a local dataset. Please pass the data_dir kwarg to load_dataset.") + # else: + # data_dir = self.config.data_dir + + # Not all datasets have predefined canonical train/val/test splits. + # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data. + + path = { + "fulltext_tools": { + "train": os.path.join(data_dir, "data/tags/fulltext_iob/fulltext_tools_train.iob"), + "test": os.path.join(data_dir, "data/tags/fulltext_iob/fulltext_tools_test.iob"), + "dev": os.path.join(data_dir, "data/tags/fulltext_iob/fulltext_tools_validation.iob"), + }, + "fulltext_tissues": { + "train": os.path.join(data_dir, "data/tags/fulltext_iob/fulltext_tissues_train.iob"), + "test": os.path.join(data_dir, "data/tags/fulltext_iob/fulltext_tissues_test.iob"), + "dev": os.path.join(data_dir, "data/tags/fulltext_iob/fulltext_tissues_validation.iob"), + }, + "abstract_tissues": { + "train": os.path.join(data_dir, "data/tags/abstract_iob/abstract_tissues_train.iob"), + "test": os.path.join(data_dir, "data/tags/abstract_iob/abstract_tissues_test.iob"), + "dev": os.path.join(data_dir, "data/tags/abstract_iob/abstract_tissues_validation.iob"), + }, + } + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": path[self.config.name]["train"], + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": path[self.config.name]["test"], + "split": "test", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": path[self.config.name]["dev"], + "split": "dev", + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + + # TODO: change the args of this function to match the keys in `gen_kwargs`. 
You may add any necessary kwargs. + + + def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + # TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset. + + # The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example. + + # NOTE: For local datasets you will have access to self.config.data_dir and self.config.data_files + + with open(filepath, "r") as f: + id_value = None + tokens = [] + tags = [] + key = 0 + for line in f: + line = line.strip() + if line: + parts = line.split() + if parts[1] == "begin": + if id_value is not None: + yield key, {"id": id_value, "tokens": tokens, "tags": tags} + key += 1 + tokens = [] + tags = [] + id_value = parts[0] + elif parts[1] == "end": + yield key, {"id": id_value, "tokens": tokens, "tags": tags} + key += 1 + id_value = None + tokens = [] + tags = [] + else: + tokens.append(parts[0]) + tags.append(parts[1]) + if id_value is not None: + yield key, {"id": id_value, "tokens": tokens, "tags": tags} + key += 1 + + + + +# This template is based on the following template from the datasets package: +# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py + + +# This allows you to run your dataloader with `python flambe.py` during development +# TODO: Remove this before making your PR +if __name__ == "__main__": + datasets.load_dataset(__file__) From 1d6cbb633f161bcb10f398709fa8b0d40785659c Mon Sep 17 00:00:00 2001 From: Jeffrey Zhong <20jeffreyzhong02@gmail.com> Date: Tue, 21 May 2024 13:28:29 -0700 Subject: [PATCH 2/7] added ned data --- bigbio/hub/hub_repos/flambe/README.md | 29 ++--- bigbio/hub/hub_repos/flambe/flambe.py | 155 +++++++++++++++++++------- 2 files changed, 128 insertions(+), 56 deletions(-) diff --git a/bigbio/hub/hub_repos/flambe/README.md b/bigbio/hub/hub_repos/flambe/README.md index 5fec000e9..1a658d553 100644 --- a/bigbio/hub/hub_repos/flambe/README.md +++ b/bigbio/hub/hub_repos/flambe/README.md @@ -1,18 +1,18 @@ --- language: - - en [This needs to be a supported huggingface language code] + - en bigbio_language: - English -license: apache-2.0 [this shoudl be a supported huggingface license] +license: cc-by-4.0 bigbio_license_shortname: APACHE_2p0 multilinguality: monolingual -pretty_name: SciTail -homepage: https://allenai.org/data/scitail +pretty_name: FlaMBe +homepage: https://github.com/ylaboratory/flambe bigbio_pubmed: false bigbio_public: true bigbio_tasks: - - TEXTUAL_ENTAILMENT -paperswithcode_id: scitail + - NAMED_ENTITY_RECOGNITION + - NAMED_ENTITY_DISAMBIGUATION --- @@ -20,21 +20,24 @@ paperswithcode_id: scitail ## Dataset Description -- **Homepage:** https://allenai.org/data/scitail +- **Homepage:** https://github.com/ylaboratory/flambe - **Pubmed:** False - **Public:** True - **Tasks:** TE [This needs to be a comma delimitted string of task short names] -[This can be equal to the `_DESCRIPTION` attribute of the dataset you are implementing] The SciTail dataset is an entailment dataset created from multiple-choice science exams and web sentences. Each question and the correct answer choice are converted into an assertive statement to form the hypothesis. We use information retrieval to obtain relevant text from a large text corpus of web sentences, and use these sentences as a premise P. We crowd source the annotation of such premise-hypothesis pair as supports (entails) or not (neutral), in order to create the SciTail dataset. 
The dataset contains 27,026 examples with 10,101 examples with entails label and 16,925 examples with neutral label. +FlaMBe is a dataset aimed at procedural knowledge extraction from biomedical texts, particularly focusing on single cell research methodologies described in academic papers. It includes annotations from 55 full-text articles and 1,195 abstracts, covering nearly 710,000 tokens, and is distinguished by its comprehensive named entity recognition (NER) and disambiguation (NED) for tissue/cell types, software tools, and computational methods. This dataset, to our knowledge, is the largest of its kind for tissue/cell types, links entities to identifiers in relevant knowledge bases and annotates nearly 400 workflow relations between tool-context pairs. ## Citation Information ``` -@inproceedings{scitail, - author = {Tushar Khot and Ashish Sabharwal and Peter Clark}, - booktitle = {AAAI} - title = {SciTail: A Textual Entailment Dataset from Science Question Answering}, - year = {2018} +@inproceedings{, + author = {Dannenfelser, Ruth and Zhong, Jeffrey and Zhang, Ran and Yao, Vicky}, + title = {Into the Single Cell Multiverse: an End-to-End Dataset for Procedural Knowledge Extraction in Biomedical Texts}, + publisher = {Advances in Neural Information Processing Systems}, + volume = {36}, + year = {2024}, + url = {https://proceedings.neurips.cc/paper_files/paper/2023/file/23e3d86c9a19d0caf2ec997e73dfcfbd-Paper-Datasets_and_Benchmarks.pdf}, +} ``` diff --git a/bigbio/hub/hub_repos/flambe/flambe.py b/bigbio/hub/hub_repos/flambe/flambe.py index d8aa7ef62..0c4e243c2 100644 --- a/bigbio/hub/hub_repos/flambe/flambe.py +++ b/bigbio/hub/hub_repos/flambe/flambe.py @@ -47,16 +47,13 @@ # TODO: Add BibTeX citation _CITATION = """\ -@article{, - author = {}, - title = {}, - journal = {}, - volume = {}, - year = {}, - url = {}, - doi = {}, - biburl = {}, - bibsource = {} +@inproceedings{, + author = {Dannenfelser, Ruth and Zhong, Jeffrey and Zhang, Ran and Yao, Vicky}, + title = {Into the Single Cell Multiverse: an End-to-End Dataset for Procedural Knowledge Extraction in Biomedical Texts}, + publisher = {Advances in Neural Information Processing Systems}, + volume = {36}, + year = {2024}, + url = {https://proceedings.neurips.cc/paper_files/paper/2023/file/23e3d86c9a19d0caf2ec997e73dfcfbd-Paper-Datasets_and_Benchmarks.pdf}, } """ @@ -67,7 +64,13 @@ # TODO: Add description of the dataset here # You can copy an official description _DESCRIPTION = """\ -This dataset is designed for XXX NLP task. +FlaMBe is a dataset aimed at procedural knowledge extraction from biomedical texts, +particularly focusing on single cell research methodologies described in academic papers. It includes +annotations from 55 full-text articles and 1,195 abstracts, covering nearly 710,000 tokens, and is +distinguished by its comprehensive named entity recognition (NER) and disambiguation (NED) for +tissue/cell types, software tools, and computational methods. This dataset, to our knowledge, is +the largest of its kind for tissue/cell types, links entities to identifiers in relevant knowledge +bases and annotates nearly 400 workflow relations between tool-context pairs. """ # TODO: Add a link to an official homepage for the dataset here (if possible) @@ -77,7 +80,7 @@ # Note that this doesn't have to be a common open source license. # Some datasets have custom licenses. 
In this case, simply put the full license terms # into `_LICENSE` -_LICENSE = "" +_LICENSE = "Creative Commons Attribution 4.0 International" # TODO: Add links to the urls needed to download your dataset files. # For local datasets, this variable can be an empty dictionary. @@ -87,17 +90,25 @@ # However, if you need to access different files for each config you can have multiple entries in this dict. # This can be an arbitrarily nested dict/list of URLs (see below in `_split_generators` method) _URLS = { - _DATASETNAME: "https://zenodo.org/records/10050681/files/data.zip?download ", + _DATASETNAME: "https://zenodo.org/records/10050681/files/data.zip?download", + "ned": {"tissue_test": "https://zenodo.org/records/11218662/files/tissue_ned_test.csv?download", + "tissue_train": "https://zenodo.org/records/11218662/files/tissue_ned_train.csv?download", + "tissue_val": "https://zenodo.org/records/11218662/files/tissue_ned_val.csv?download", + "tool_test": "https://zenodo.org/records/11218662/files/tool_ned_test.csv?download", + "tool_train": "https://zenodo.org/records/11218662/files/tool_ned_train.csv?download", + "tool_val" : "https://zenodo.org/records/11218662/files/tool_ned_val.csv?download" + }, } # TODO: add supported task by dataset. One dataset may support multiple tasks -_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] +_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, + Tasks.NAMED_ENTITY_DISAMBIGUATION, + ] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] # TODO: set this to a version that is associated with the dataset. if none exists use "1.0.0" # This version doesn't have to be consistent with semantic versioning. Anything that is # provided by the original dataset as a version goes. _SOURCE_VERSION = "1.0.0" - _BIGBIO_VERSION = "1.0.0" @@ -132,7 +143,7 @@ class NewDataset(datasets.GeneratorBasedBuilder): BigBioConfig( name="fulltext_tools", version=SOURCE_VERSION, - description="...", + description="fulltext_tools", schema="source", subset_id="fulltext_tools", ), @@ -146,10 +157,24 @@ class NewDataset(datasets.GeneratorBasedBuilder): BigBioConfig( name="abstract_tissues", version=SOURCE_VERSION, - description="fulltext_tissues", + description="abstract_tissues", schema="source", subset_id="abstract_tissues", ), + BigBioConfig( + name="ned_tissues", + version=SOURCE_VERSION, + description="ned_fulltext_tissues", + schema="source_ned_tissue", + subset_id="ned_tissues", + ), + BigBioConfig( + name="ned_tools", + version=SOURCE_VERSION, + description="ned_fulltext_tools", + schema="source_ned_tool", + subset_id="ned_tools", + ), ] DEFAULT_CONFIG_NAME = "fulltext_tools" @@ -173,6 +198,24 @@ def _info(self) -> datasets.DatasetInfo: "tags": datasets.Sequence(datasets.Value("string")), } ) + + elif self.config.schema == "source_ned_tissue": + features = datasets.Features( + { + "orginal_text": datasets.Value("string"), + "mapped_NCIT": datasets.Value("string"), + "NCIT_name": datasets.Value("string"), + } + ) + + elif self.config.schema == "source_ned_tool": + features = datasets.Features( + { + "orginal_text": datasets.Value("string"), + "standardized_name": datasets.Value("string"), + "url": datasets.Value("string"), + } + ) # Choose the appropriate bigbio schema for your task and copy it here. You can find information on the schemas in the CONTRIBUTING guide. 
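The new `source_ned_*` schemas above mirror three-column CSV files. A small parsing sketch (separate from the patch) of how one row would map onto the `source_ned_tissue` features; the example row is hypothetical, but the split logic matches `_generate_examples` further down in this diff.

```python
# Hypothetical CSV row; per the loader code, the tissue_ned_*.csv files carry three
# comma-separated columns: the original mention, the mapped NCIT identifier, and the NCIT name.
line = "peripheral blood mononuclear cells,C12345,placeholder NCIT name\n"

# Same parsing as in _generate_examples for the "source_ned_tissue" schema.
orginal_text, mapped_NCIT, NCIT_name = line.strip("\n").split(",")
example = {
    "orginal_text": orginal_text,  # key spelled as in _info() (sic)
    "mapped_NCIT": mapped_NCIT,
    "NCIT_name": NCIT_name,
}
```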
@@ -235,6 +278,17 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: "test": os.path.join(data_dir, "data/tags/abstract_iob/abstract_tissues_test.iob"), "dev": os.path.join(data_dir, "data/tags/abstract_iob/abstract_tissues_validation.iob"), }, + "ned_tissues" : { + "train": dl_manager.download_and_extract(_URLS["ned"]["tissue_train"]), + "test": dl_manager.download_and_extract(_URLS["ned"]["tissue_test"]), + "dev": dl_manager.download_and_extract(_URLS["ned"]["tissue_val"]), + + }, + "ned_tools" : { + "train": dl_manager.download_and_extract(_URLS["ned"]["tool_train"]), + "test": dl_manager.download_and_extract(_URLS["ned"]["tool_test"]), + "dev": dl_manager.download_and_extract(_URLS["ned"]["tool_val"]), + } } return [ @@ -275,36 +329,51 @@ def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: # NOTE: For local datasets you will have access to self.config.data_dir and self.config.data_files - with open(filepath, "r") as f: - id_value = None - tokens = [] - tags = [] - key = 0 - for line in f: - line = line.strip() - if line: - parts = line.split() - if parts[1] == "begin": - if id_value is not None: + if self.config.schema == 'source': + with open(filepath, "r") as f: + id_value = None + tokens = [] + tags = [] + key = 0 + for line in f: + line = line.strip() + if line: + parts = line.split() + if parts[1] == "begin": + if id_value is not None: + yield key, {"id": id_value, "tokens": tokens, "tags": tags} + key += 1 + tokens = [] + tags = [] + id_value = parts[0] + elif parts[1] == "end": yield key, {"id": id_value, "tokens": tokens, "tags": tags} key += 1 + id_value = None tokens = [] tags = [] - id_value = parts[0] - elif parts[1] == "end": - yield key, {"id": id_value, "tokens": tokens, "tags": tags} - key += 1 - id_value = None - tokens = [] - tags = [] - else: - tokens.append(parts[0]) - tags.append(parts[1]) - if id_value is not None: - yield key, {"id": id_value, "tokens": tokens, "tags": tags} - key += 1 - - + else: + tokens.append(parts[0]) + tags.append(parts[1]) + if id_value is not None: + yield key, {"id": id_value, "tokens": tokens, "tags": tags} + key += 1 + + elif self.config.schema == "source_ned_tissue": + key = 0 + for line in open(filepath): + csv_row = line.strip('\n').split(",") + if csv_row is not None: + yield key, { "orginal_text": csv_row[0], "mapped_NCIT": csv_row[1], "NCIT_name": csv_row[2]} + key += 1 + + elif self.config.schema == "source_ned_tool": + key = 0 + for line in open(filepath): + csv_row = line.strip('\n').split(",") + if csv_row is not None: + yield key, { "orginal_text": csv_row[0], "standardized_name": csv_row[1], "url": csv_row[2]} + key += 1 # This template is based on the following template from the datasets package: From bd7a8f79555eab12ffbf089e171b589880c2a9aa Mon Sep 17 00:00:00 2001 From: Jeffrey Zhong <20jeffreyzhong02@gmail.com> Date: Mon, 3 Jun 2024 23:47:11 -0700 Subject: [PATCH 3/7] changed comments --- bigbio/hub/hub_repos/flambe/README.md | 4 +- bigbio/hub/hub_repos/flambe/flambe.py | 163 +++++--------------------- 2 files changed, 30 insertions(+), 137 deletions(-) diff --git a/bigbio/hub/hub_repos/flambe/README.md b/bigbio/hub/hub_repos/flambe/README.md index 1a658d553..53a4d1a6d 100644 --- a/bigbio/hub/hub_repos/flambe/README.md +++ b/bigbio/hub/hub_repos/flambe/README.md @@ -16,14 +16,14 @@ bigbio_tasks: --- -# Dataset Card for SciTail +# Dataset Card for Flambe ## Dataset Description - **Homepage:** https://github.com/ylaboratory/flambe - **Pubmed:** False - **Public:** True -- 
**Tasks:** TE [This needs to be a comma delimitted string of task short names] +- **Tasks:** NER,NED FlaMBe is a dataset aimed at procedural knowledge extraction from biomedical texts, particularly focusing on single cell research methodologies described in academic papers. It includes annotations from 55 full-text articles and 1,195 abstracts, covering nearly 710,000 tokens, and is distinguished by its comprehensive named entity recognition (NER) and disambiguation (NED) for tissue/cell types, software tools, and computational methods. This dataset, to our knowledge, is the largest of its kind for tissue/cell types, links entities to identifiers in relevant knowledge bases and annotates nearly 400 workflow relations between tool-context pairs. diff --git a/bigbio/hub/hub_repos/flambe/flambe.py b/bigbio/hub/hub_repos/flambe/flambe.py index 0c4e243c2..f854cb268 100644 --- a/bigbio/hub/hub_repos/flambe/flambe.py +++ b/bigbio/hub/hub_repos/flambe/flambe.py @@ -13,23 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" -This template serves as a starting point for contributing a dataset to the BigScience Biomedical repo. - -When modifying it for your dataset, look for TODO items that offer specific instructions. - -Full documentation on writing dataset loading scripts can be found here: -https://huggingface.co/docs/datasets/add_dataset.html - -To create a dataset loading script you will create a class and implement 3 methods: - * `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object. - * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associate local data with each split. - * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`. - -TODO: Before submitting your script, delete this doc string and replace it with a description of your dataset. - -[bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment) -""" import os from typing import List, Tuple, Dict @@ -39,13 +22,10 @@ from .bigbiohub import BigBioConfig from .bigbiohub import Tasks -# TODO: import the schema that fits your dataset: from .bigbiohub import kb_features -# TODO: add True or False boolean value indicating if this dataset is local or not _LOCAL = False -# TODO: Add BibTeX citation _CITATION = """\ @inproceedings{, author = {Dannenfelser, Ruth and Zhong, Jeffrey and Zhang, Ran and Yao, Vicky}, @@ -57,12 +37,9 @@ } """ -# TODO: create a module level variable with your dataset name (should match script name) -# E.g. Hallmarks of Cancer: flambe --> hallmarks_of_cancer _DATASETNAME = "flambe" +_DISPLAYNAME = "Flambe" -# TODO: Add description of the dataset here -# You can copy an official description _DESCRIPTION = """\ FlaMBe is a dataset aimed at procedural knowledge extraction from biomedical texts, particularly focusing on single cell research methodologies described in academic papers. It includes @@ -73,22 +50,10 @@ bases and annotates nearly 400 workflow relations between tool-context pairs. """ -# TODO: Add a link to an official homepage for the dataset here (if possible) _HOMEPAGE = "https://github.com/ylaboratory/flambe" -# TODO: Add the licence for the dataset here (if possible) -# Note that this doesn't have to be a common open source license. -# Some datasets have custom licenses. 
In this case, simply put the full license terms -# into `_LICENSE` _LICENSE = "Creative Commons Attribution 4.0 International" -# TODO: Add links to the urls needed to download your dataset files. -# For local datasets, this variable can be an empty dictionary. - -# For publicly available datasets you will most likely end up passing these URLs to dl_manager in _split_generators. -# In most cases the URLs will be the same for the source and bigbio config. -# However, if you need to access different files for each config you can have multiple entries in this dict. -# This can be an arbitrarily nested dict/list of URLs (see below in `_split_generators` method) _URLS = { _DATASETNAME: "https://zenodo.org/records/10050681/files/data.zip?download", "ned": {"tissue_test": "https://zenodo.org/records/11218662/files/tissue_ned_test.csv?download", @@ -100,97 +65,65 @@ }, } -# TODO: add supported task by dataset. One dataset may support multiple tasks _SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, Tasks.NAMED_ENTITY_DISAMBIGUATION, - ] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] + ] -# TODO: set this to a version that is associated with the dataset. if none exists use "1.0.0" -# This version doesn't have to be consistent with semantic versioning. Anything that is -# provided by the original dataset as a version goes. _SOURCE_VERSION = "1.0.0" _BIGBIO_VERSION = "1.0.0" -# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case -# Append "Dataset" to the class name: BioASQ --> BioasqDataset -class NewDataset(datasets.GeneratorBasedBuilder): +class FlambeDataset(datasets.GeneratorBasedBuilder): """TODO: Short description of my dataset.""" SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) - # You will be able to load the "source" or "bigbio" configurations with - # ds_source = datasets.load_dataset('my_dataset', name='source') - # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio') - - # For local datasets you can make use of the `data_dir` and `data_files` kwargs - # https://huggingface.co/docs/datasets/add_dataset.html#downloading-data-files-and-organizing-splits - # ds_source = datasets.load_dataset('my_dataset', name='source', data_dir="/path/to/data/files") - # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio', data_dir="/path/to/data/files") - - # TODO: For each dataset, implement Config for Source and BigBio; - # If dataset contains more than one subset (see examples/bioasq.py) implement for EACH of them. - # Each of them should contain: - # - name: should be unique for each dataset config eg. bioasq10b_(source|bigbio)_[bigbio_schema_name] - # - version: option = (SOURCE_VERSION|BIGBIO_VERSION) - # - description: one line description for the dataset - # - schema: options = (source|bigbio_[bigbio_schema_name]) - # - subset_id: subset id is the canonical name for the dataset (eg. 
bioasq10b) - # where [bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment) - BUILDER_CONFIGS = [ BigBioConfig( - name="fulltext_tools", + name="flambe_ner_fulltext_tools_source", version=SOURCE_VERSION, - description="fulltext_tools", + description="NER dataset for tools from full papers", schema="source", - subset_id="fulltext_tools", + subset_id="flambe_ner_fulltext_tools_source", ), BigBioConfig( - name="fulltext_tissues", + name="flambe_ner_fulltext_tissues_source", version=SOURCE_VERSION, - description="fulltext_tissues", + description="NER dataset for tissues from full papers", schema="source", - subset_id="fulltext_tissues", + subset_id="flambe_ner_fulltext_tissues_source", ), BigBioConfig( - name="abstract_tissues", + name="flambe_ner_abstract_tissues_source", version=SOURCE_VERSION, - description="abstract_tissues", + description="NER dataset for tissues from abstracts", schema="source", - subset_id="abstract_tissues", + subset_id="flambe_ner_abstract_tissues_source", ), BigBioConfig( - name="ned_tissues", + name="flambe_ned_tissues", version=SOURCE_VERSION, - description="ned_fulltext_tissues", + description="NED dataset for tissues from full papers", schema="source_ned_tissue", - subset_id="ned_tissues", + subset_id="flambe_ned_tissues", ), BigBioConfig( - name="ned_tools", + name="flambe_ned_tools", version=SOURCE_VERSION, - description="ned_fulltext_tools", + description="NED dataset for tools from full papers", schema="source_ned_tool", - subset_id="ned_tools", + subset_id="flambe_ned_tools", ), ] - DEFAULT_CONFIG_NAME = "fulltext_tools" + DEFAULT_CONFIG_NAME = "flambe_ner_fulltext_tools_source" def _info(self) -> datasets.DatasetInfo: - # Create the source schema; this schema will keep all keys/information/labels as close to the original dataset as possible. - - # You can arbitrarily nest lists and dictionaries. - # For iterables, use lists over tuples or `datasets.Sequence` if self.config.schema == "source": - # TODO: Create your source schema here - #raise NotImplementedError() - # EX: Arbitrary NER type dataset features = datasets.Features( { "id": datasets.Value("string"), @@ -217,16 +150,9 @@ def _info(self) -> datasets.DatasetInfo: } ) - # Choose the appropriate bigbio schema for your task and copy it here. You can find information on the schemas in the CONTRIBUTING guide. - - # In rare cases you may get a dataset that supports multiple tasks requiring multiple schemas. In that case you can define multiple bigbio configs with a bigbio_[bigbio_schema_name] format. - - # For example bigbio_kb, bigbio_t2t elif self.config.schema == "bigbio_kb": features = kb_features - # TODO: Choose your big-bio schema here - #raise NotImplementedError() - + return datasets.DatasetInfo( description=_DESCRIPTION, features=features, @@ -237,54 +163,37 @@ def _info(self) -> datasets.DatasetInfo: def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: """Returns SplitGenerators.""" - # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration - - # If you need to access the "source" or "bigbio" config choice, that will be in self.config.name - # LOCAL DATASETS: You do not need the dl_manager; you can ignore this argument. Make sure `gen_kwargs` in the return gets passed the right filepath - - # PUBLIC DATASETS: Assign your data-dir based on the dl_manager. 
- - # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs; many examples use the download_and_extract method; see the DownloadManager docs here: https://huggingface.co/docs/datasets/package_reference/builder_classes.html#datasets.DownloadManager - - # dl_manager can accept any type of nested list/dict and will give back the same structure with the url replaced with the path to local files. + # TODO: KEEP if your dataset is PUBLIC; remove if not + # TODO: KEEP if your dataset is PUBLIC; remove if not urls = _URLS[_DATASETNAME] data_dir = dl_manager.download_and_extract(urls) - # TODO: KEEP if your dataset is LOCAL; remove if NOT - # if self.config.data_dir is None: - # raise ValueError("This is a local dataset. Please pass the data_dir kwarg to load_dataset.") - # else: - # data_dir = self.config.data_dir - - # Not all datasets have predefined canonical train/val/test splits. - # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data. - path = { - "fulltext_tools": { + "flambe_ner_fulltext_tools_source": { "train": os.path.join(data_dir, "data/tags/fulltext_iob/fulltext_tools_train.iob"), "test": os.path.join(data_dir, "data/tags/fulltext_iob/fulltext_tools_test.iob"), "dev": os.path.join(data_dir, "data/tags/fulltext_iob/fulltext_tools_validation.iob"), }, - "fulltext_tissues": { + "flambe_ner_fulltext_tissues_source": { "train": os.path.join(data_dir, "data/tags/fulltext_iob/fulltext_tissues_train.iob"), "test": os.path.join(data_dir, "data/tags/fulltext_iob/fulltext_tissues_test.iob"), "dev": os.path.join(data_dir, "data/tags/fulltext_iob/fulltext_tissues_validation.iob"), }, - "abstract_tissues": { + "flambe_ner_abstract_tissues_source": { "train": os.path.join(data_dir, "data/tags/abstract_iob/abstract_tissues_train.iob"), "test": os.path.join(data_dir, "data/tags/abstract_iob/abstract_tissues_test.iob"), "dev": os.path.join(data_dir, "data/tags/abstract_iob/abstract_tissues_validation.iob"), }, - "ned_tissues" : { + "flambe_ned_tissues" : { "train": dl_manager.download_and_extract(_URLS["ned"]["tissue_train"]), "test": dl_manager.download_and_extract(_URLS["ned"]["tissue_test"]), "dev": dl_manager.download_and_extract(_URLS["ned"]["tissue_val"]), }, - "ned_tools" : { + "flambe_ned_tools" : { "train": dl_manager.download_and_extract(_URLS["ned"]["tool_train"]), "test": dl_manager.download_and_extract(_URLS["ned"]["tool_test"]), "dev": dl_manager.download_and_extract(_URLS["ned"]["tool_val"]), @@ -294,7 +203,6 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, - # Whatever you put in gen_kwargs will be passed to _generate_examples gen_kwargs={ "filepath": path[self.config.name]["train"], "split": "train", @@ -316,19 +224,10 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: ), ] - # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` - - # TODO: change the args of this function to match the keys in `gen_kwargs`. You may add any necessary kwargs. - def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: """Yields examples as (key, example) tuples.""" - # TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset. - - # The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example. 
- - # NOTE: For local datasets you will have access to self.config.data_dir and self.config.data_files - + if self.config.schema == 'source': with open(filepath, "r") as f: id_value = None @@ -376,11 +275,5 @@ def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: key += 1 -# This template is based on the following template from the datasets package: -# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py - - -# This allows you to run your dataloader with `python flambe.py` during development -# TODO: Remove this before making your PR if __name__ == "__main__": datasets.load_dataset(__file__) From 5ef9d64688ec3171b6e2710943a6e6cbdddd8cb8 Mon Sep 17 00:00:00 2001 From: Jeffrey Zhong <20jeffreyzhong02@gmail.com> Date: Mon, 3 Jun 2024 23:58:06 -0700 Subject: [PATCH 4/7] format --- bigbio/hub/hub_repos/flambe/flambe.py | 74 ++++++++++++--------------- 1 file changed, 34 insertions(+), 40 deletions(-) diff --git a/bigbio/hub/hub_repos/flambe/flambe.py b/bigbio/hub/hub_repos/flambe/flambe.py index f854cb268..fe0bed9fd 100644 --- a/bigbio/hub/hub_repos/flambe/flambe.py +++ b/bigbio/hub/hub_repos/flambe/flambe.py @@ -15,14 +15,12 @@ import os -from typing import List, Tuple, Dict import re +from typing import Dict, List, Tuple import datasets -from .bigbiohub import BigBioConfig -from .bigbiohub import Tasks -from .bigbiohub import kb_features +from .bigbiohub import BigBioConfig, Tasks, kb_features _LOCAL = False @@ -56,18 +54,20 @@ _URLS = { _DATASETNAME: "https://zenodo.org/records/10050681/files/data.zip?download", - "ned": {"tissue_test": "https://zenodo.org/records/11218662/files/tissue_ned_test.csv?download", - "tissue_train": "https://zenodo.org/records/11218662/files/tissue_ned_train.csv?download", - "tissue_val": "https://zenodo.org/records/11218662/files/tissue_ned_val.csv?download", - "tool_test": "https://zenodo.org/records/11218662/files/tool_ned_test.csv?download", - "tool_train": "https://zenodo.org/records/11218662/files/tool_ned_train.csv?download", - "tool_val" : "https://zenodo.org/records/11218662/files/tool_ned_val.csv?download" - }, + "ned": { + "tissue_test": "https://zenodo.org/records/11218662/files/tissue_ned_test.csv?download", + "tissue_train": "https://zenodo.org/records/11218662/files/tissue_ned_train.csv?download", + "tissue_val": "https://zenodo.org/records/11218662/files/tissue_ned_val.csv?download", + "tool_test": "https://zenodo.org/records/11218662/files/tool_ned_test.csv?download", + "tool_train": "https://zenodo.org/records/11218662/files/tool_ned_train.csv?download", + "tool_val": "https://zenodo.org/records/11218662/files/tool_ned_val.csv?download", + }, } -_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, - Tasks.NAMED_ENTITY_DISAMBIGUATION, - ] +_SUPPORTED_TASKS = [ + Tasks.NAMED_ENTITY_RECOGNITION, + Tasks.NAMED_ENTITY_DISAMBIGUATION, +] _SOURCE_VERSION = "1.0.0" _BIGBIO_VERSION = "1.0.0" @@ -120,10 +120,7 @@ class FlambeDataset(datasets.GeneratorBasedBuilder): DEFAULT_CONFIG_NAME = "flambe_ner_fulltext_tools_source" def _info(self) -> datasets.DatasetInfo: - - if self.config.schema == "source": - features = datasets.Features( { "id": datasets.Value("string"), @@ -131,7 +128,7 @@ def _info(self) -> datasets.DatasetInfo: "tags": datasets.Sequence(datasets.Value("string")), } ) - + elif self.config.schema == "source_ned_tissue": features = datasets.Features( { @@ -140,7 +137,7 @@ def _info(self) -> datasets.DatasetInfo: "NCIT_name": datasets.Value("string"), } ) - + elif self.config.schema == 
"source_ned_tool": features = datasets.Features( { @@ -152,7 +149,7 @@ def _info(self) -> datasets.DatasetInfo: elif self.config.schema == "bigbio_kb": features = kb_features - + return datasets.DatasetInfo( description=_DESCRIPTION, features=features, @@ -164,9 +161,8 @@ def _info(self) -> datasets.DatasetInfo: def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: """Returns SplitGenerators.""" - # TODO: KEEP if your dataset is PUBLIC; remove if not - + # TODO: KEEP if your dataset is PUBLIC; remove if not urls = _URLS[_DATASETNAME] data_dir = dl_manager.download_and_extract(urls) @@ -187,17 +183,16 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: "test": os.path.join(data_dir, "data/tags/abstract_iob/abstract_tissues_test.iob"), "dev": os.path.join(data_dir, "data/tags/abstract_iob/abstract_tissues_validation.iob"), }, - "flambe_ned_tissues" : { + "flambe_ned_tissues": { "train": dl_manager.download_and_extract(_URLS["ned"]["tissue_train"]), "test": dl_manager.download_and_extract(_URLS["ned"]["tissue_test"]), - "dev": dl_manager.download_and_extract(_URLS["ned"]["tissue_val"]), - - }, - "flambe_ned_tools" : { + "dev": dl_manager.download_and_extract(_URLS["ned"]["tissue_val"]), + }, + "flambe_ned_tools": { "train": dl_manager.download_and_extract(_URLS["ned"]["tool_train"]), "test": dl_manager.download_and_extract(_URLS["ned"]["tool_test"]), - "dev": dl_manager.download_and_extract(_URLS["ned"]["tool_val"]), - } + "dev": dl_manager.download_and_extract(_URLS["ned"]["tool_val"]), + }, } return [ @@ -224,11 +219,10 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: ), ] - def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: """Yields examples as (key, example) tuples.""" - - if self.config.schema == 'source': + + if self.config.schema == "source": with open(filepath, "r") as f: id_value = None tokens = [] @@ -241,13 +235,13 @@ def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: if parts[1] == "begin": if id_value is not None: yield key, {"id": id_value, "tokens": tokens, "tags": tags} - key += 1 + key += 1 tokens = [] tags = [] id_value = parts[0] elif parts[1] == "end": yield key, {"id": id_value, "tokens": tokens, "tags": tags} - key += 1 + key += 1 id_value = None tokens = [] tags = [] @@ -259,19 +253,19 @@ def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: key += 1 elif self.config.schema == "source_ned_tissue": - key = 0 - for line in open(filepath): - csv_row = line.strip('\n').split(",") + key = 0 + for line in open(filepath): + csv_row = line.strip("\n").split(",") if csv_row is not None: - yield key, { "orginal_text": csv_row[0], "mapped_NCIT": csv_row[1], "NCIT_name": csv_row[2]} + yield key, {"orginal_text": csv_row[0], "mapped_NCIT": csv_row[1], "NCIT_name": csv_row[2]} key += 1 elif self.config.schema == "source_ned_tool": key = 0 for line in open(filepath): - csv_row = line.strip('\n').split(",") + csv_row = line.strip("\n").split(",") if csv_row is not None: - yield key, { "orginal_text": csv_row[0], "standardized_name": csv_row[1], "url": csv_row[2]} + yield key, {"orginal_text": csv_row[0], "standardized_name": csv_row[1], "url": csv_row[2]} key += 1 From 64537acc64b98407fb2a8e20c69af14cdce927b5 Mon Sep 17 00:00:00 2001 From: Jeffrey Zhong <20jeffreyzhong02@gmail.com> Date: Tue, 4 Jun 2024 01:06:27 -0700 Subject: [PATCH 5/7] fixed a bug --- bigbio/hub/hub_repos/flambe/flambe.py | 80 +++++++++++++++++++++++++-- 1 file changed, 76 
insertions(+), 4 deletions(-) diff --git a/bigbio/hub/hub_repos/flambe/flambe.py b/bigbio/hub/hub_repos/flambe/flambe.py index fe0bed9fd..ffeab1c2b 100644 --- a/bigbio/hub/hub_repos/flambe/flambe.py +++ b/bigbio/hub/hub_repos/flambe/flambe.py @@ -20,9 +20,13 @@ import datasets -from .bigbiohub import BigBioConfig, Tasks, kb_features +from bigbio.utils import schemas + +from .bigbiohub import BigBioConfig, Tasks _LOCAL = False +_LANGUAGES = ["English"] +_PUBMED = False _CITATION = """\ @inproceedings{, @@ -50,7 +54,7 @@ _HOMEPAGE = "https://github.com/ylaboratory/flambe" -_LICENSE = "Creative Commons Attribution 4.0 International" +_LICENSE = "CC_BY_4p0" _URLS = { _DATASETNAME: "https://zenodo.org/records/10050681/files/data.zip?download", @@ -115,6 +119,20 @@ class FlambeDataset(datasets.GeneratorBasedBuilder): schema="source_ned_tool", subset_id="flambe_ned_tools", ), + BigBioConfig( + name="flambe_fulltext_tools_bigbio_text", + version=BIGBIO_VERSION, + description="Flambe Tissues BigBio schema", + schema="bigbio_text", + subset_id="flambe_tool_bigbio", + ), + BigBioConfig( + name="flambe_fulltext_tissues_bigbio_text", + version=BIGBIO_VERSION, + description="Flambe Tool BigBio schema", + schema="bigbio_text", + subset_id="flambe_tissue_bigbio", + ), ] DEFAULT_CONFIG_NAME = "flambe_ner_fulltext_tools_source" @@ -147,8 +165,8 @@ def _info(self) -> datasets.DatasetInfo: } ) - elif self.config.schema == "bigbio_kb": - features = kb_features + elif self.config.schema == "bigbio_text": + features = schemas.text_features return datasets.DatasetInfo( description=_DESCRIPTION, @@ -193,6 +211,16 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: "test": dl_manager.download_and_extract(_URLS["ned"]["tool_test"]), "dev": dl_manager.download_and_extract(_URLS["ned"]["tool_val"]), }, + "flambe_fulltext_tools_bigbio_text": { + "train": os.path.join(data_dir, "data/tags/fulltext_iob/fulltext_tools_train.iob"), + "test": os.path.join(data_dir, "data/tags/fulltext_iob/fulltext_tools_test.iob"), + "dev": os.path.join(data_dir, "data/tags/fulltext_iob/fulltext_tools_validation.iob"), + }, + "flambe_fulltext_tissues_bigbio_text": { + "train": os.path.join(data_dir, "data/tags/fulltext_iob/fulltext_tissues_train.iob"), + "test": os.path.join(data_dir, "data/tags/fulltext_iob/fulltext_tissues_test.iob"), + "dev": os.path.join(data_dir, "data/tags/fulltext_iob/fulltext_tissues_validation.iob"), + }, } return [ @@ -251,6 +279,50 @@ def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: if id_value is not None: yield key, {"id": id_value, "tokens": tokens, "tags": tags} key += 1 + elif self.config.schema == "bigbio_text": + with open(filepath, "r") as f: + id_value = None + tokens = [] + tags = [] + key = 0 + for line in f: + line = line.strip() + if line: + parts = line.split() + if parts[1] == "begin": + if id_value is not None: + yield key, { + "id": key, + "document_id": id_value, + "text": " ".join(tokens), + "labels": tags, + } + key += 1 + tokens = [] + tags = [] + id_value = parts[0] + elif parts[1] == "end": + yield key, { + "id": key, + "document_id": id_value, + "text": " ".join(tokens), + "labels": tags, + } + key += 1 + id_value = None + tokens = [] + tags = [] + else: + tokens.append(parts[0]) + tags.append(parts[1]) + if id_value is not None: + yield key, { + "id": key, + "document_id": id_value, + "text": " ".join(tokens), + "labels": tags, + } + key += 1 elif self.config.schema == "source_ned_tissue": key = 0 From 
65e1653c39d60398eb5099483e3299cf41f069ff Mon Sep 17 00:00:00 2001
From: Jeffrey Zhong <20jeffreyzhong02@gmail.com>
Date: Tue, 4 Jun 2024 13:10:18 -0700
Subject: [PATCH 6/7] added abstracts with bigbio text schema

---
 bigbio/hub/hub_repos/flambe/flambe.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/bigbio/hub/hub_repos/flambe/flambe.py b/bigbio/hub/hub_repos/flambe/flambe.py
index ffeab1c2b..fd3b53ef8 100644
--- a/bigbio/hub/hub_repos/flambe/flambe.py
+++ b/bigbio/hub/hub_repos/flambe/flambe.py
@@ -133,6 +133,13 @@ class FlambeDataset(datasets.GeneratorBasedBuilder):
             schema="bigbio_text",
             subset_id="flambe_tissue_bigbio",
         ),
+        BigBioConfig(
+            name="flambe_abstract_tissues_bigbio_text",
+            version=BIGBIO_VERSION,
+            description="Flambe Tool BigBio schema",
+            schema="bigbio_text",
+            subset_id="flambe_tissue_bigbio",
+        ),
     ]
 
     DEFAULT_CONFIG_NAME = "flambe_ner_fulltext_tools_source"
@@ -221,6 +228,11 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
                 "test": os.path.join(data_dir, "data/tags/fulltext_iob/fulltext_tissues_test.iob"),
                 "dev": os.path.join(data_dir, "data/tags/fulltext_iob/fulltext_tissues_validation.iob"),
             },
+            "flambe_abstract_tissues_bigbio_text": {
+                "train": os.path.join(data_dir, "data/tags/abstract_iob/abstract_tissues_train.iob"),
+                "test": os.path.join(data_dir, "data/tags/abstract_iob/abstract_tissues_test.iob"),
+                "dev": os.path.join(data_dir, "data/tags/abstract_iob/abstract_tissues_validation.iob"),
+            },
         }
 
         return [

From 1c98993da740a48a51d09a12b68a0cae6cca1942 Mon Sep 17 00:00:00 2001
From: Jeffrey Zhong <20jeffreyzhong02@gmail.com>
Date: Tue, 4 Jun 2024 16:03:39 -0700
Subject: [PATCH 7/7] fixed bigbio dependency bug

---
 bigbio/hub/hub_repos/flambe/flambe.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/bigbio/hub/hub_repos/flambe/flambe.py b/bigbio/hub/hub_repos/flambe/flambe.py
index fd3b53ef8..3d157bbf8 100644
--- a/bigbio/hub/hub_repos/flambe/flambe.py
+++ b/bigbio/hub/hub_repos/flambe/flambe.py
@@ -20,9 +20,7 @@
 
 import datasets
 
-from bigbio.utils import schemas
-
-from .bigbiohub import BigBioConfig, Tasks
+from .bigbiohub import BigBioConfig, Tasks, text_features
 
 _LOCAL = False
 _LANGUAGES = ["English"]
@@ -173,7 +171,7 @@ def _info(self) -> datasets.DatasetInfo:
             )
 
         elif self.config.schema == "bigbio_text":
-            features = schemas.text_features
+            features = text_features
 
         return datasets.DatasetInfo(
             description=_DESCRIPTION,