From 4c81e912b391d1992bd187f6419813da0a3efd6b Mon Sep 17 00:00:00 2001 From: Gully Burns <45613102+GullyBurns@users.noreply.github.com> Date: Tue, 26 Sep 2023 06:21:32 -0700 Subject: [PATCH] Closes #892 (#893) * Updates to czi_drsm dataset * Updates to czi_drsm dataset * Running all tests * Updating documentation --- bigbio/hub/hub_repos/czi_drsm/README.md | 68 +++ bigbio/hub/hub_repos/czi_drsm/__init__.py | 0 bigbio/hub/hub_repos/czi_drsm/bigbiohub.py | 590 +++++++++++++++++++++ bigbio/hub/hub_repos/czi_drsm/czi_drsm.py | 410 ++++++++++++++ 4 files changed, 1068 insertions(+) create mode 100644 bigbio/hub/hub_repos/czi_drsm/README.md create mode 100644 bigbio/hub/hub_repos/czi_drsm/__init__.py create mode 100644 bigbio/hub/hub_repos/czi_drsm/bigbiohub.py create mode 100644 bigbio/hub/hub_repos/czi_drsm/czi_drsm.py diff --git a/bigbio/hub/hub_repos/czi_drsm/README.md b/bigbio/hub/hub_repos/czi_drsm/README.md new file mode 100644 index 000000000..7b7e6ee6b --- /dev/null +++ b/bigbio/hub/hub_repos/czi_drsm/README.md @@ -0,0 +1,68 @@ +--- +language: + - en +bigbio_language: + - English +license: Creative Commons Zero v1.0 Universal +bigbio_license_shortname: cc0-1.0 +multilinguality: monolingual +pretty_name: CZI DRSM +homepage: https://github.com/chanzuckerberg/DRSM-corpus +bigbio_pubmed: false +bigbio_public: true +bigbio_tasks: + - TXTCLASS +--- + +# Dataset Card for CZI DRSM + +## Dataset Description + +- **Homepage:** https://github.com/chanzuckerberg/DRSM-corpus +- **Pubmed:** False +- **Public:** True +- **Tasks:** TXTCLASS + +Research Article document classification dataset based on aspects of disease research. 
Currently, the dataset consists of three subsets: + +(A) classifies title/abstracts of papers into most popular subtypes of clinical, basic, and translational papers (~20k papers); + - Clinical Characteristics, Disease Pathology, and Diagnosis - + Text that describes (A) symptoms, signs, or ‘phenotype’ of a disease; + (B) the effects of the disease on patient organs, tissues, or cells; + (C) the results of clinical tests that reveal pathology (including + biomarkers); (D) research that use this information to figure out + a diagnosis. + - Therapeutics in the clinic - + Text describing how treatments work in the clinic (but not in a clinical trial). + - Disease mechanism - + Text that describes either (A) mechanistic involvement of specific genes in disease + (deletions, gain of function, etc); (B) how molecular signalling or metabolism + binding, activating, phosphorylation, concentration increase, etc.) + are involved in the mechanism of a disease; or (C) the physiological + mechanism of disease at the level of tissues, organs, and body systems. + - Patient-Based Therapeutics - + Text describing (A) Clinical trials (studies of therapeutic measures being + used on patients in a clinical trial); (B) Post Marketing Drug Surveillance + (effects of a drug after approval in the general population or as part of + ‘standard healthcare’); (C) Drug repurposing (how a drug that has been + approved for one use is being applied to a new disease). + +(B) identifies whether a title/abstract of a paper describes substantive research into Quality of Life (~10k papers); + - -1 - the paper is not a primary experimental study in rare disease + - 0 - the study does not directly investigate quality of life + - 1 - the study investigates qol but not as its primary contribution + - 2 - the study's primary contribution centers on quality of life measures + +(C) identifies if a paper is a natural history study (~10k papers). 
+ - -1 - the paper is not a primary experimental study in rare disease + - 0 - the study is not directly investigating the natural history of a disease + - 1 - the study includes some elements a natural history but not as its primary contribution + - 2 - the study's primary contribution centers on observing the time course of a rare disease + +These classifications are particularly relevant in rare disease research, a field that is generally understudied. + +## Citation Information + +``` +# N/A +``` diff --git a/bigbio/hub/hub_repos/czi_drsm/__init__.py b/bigbio/hub/hub_repos/czi_drsm/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/bigbio/hub/hub_repos/czi_drsm/bigbiohub.py b/bigbio/hub/hub_repos/czi_drsm/bigbiohub.py new file mode 100644 index 000000000..f4da7bb78 --- /dev/null +++ b/bigbio/hub/hub_repos/czi_drsm/bigbiohub.py @@ -0,0 +1,590 @@ +from collections import defaultdict +from dataclasses import dataclass +from enum import Enum +import logging +from pathlib import Path +from types import SimpleNamespace +from typing import TYPE_CHECKING, Dict, Iterable, List, Tuple + +import datasets + +if TYPE_CHECKING: + import bioc + +logger = logging.getLogger(__name__) + + +BigBioValues = SimpleNamespace(NULL="") + + +@dataclass +class BigBioConfig(datasets.BuilderConfig): + """BuilderConfig for BigBio.""" + + name: str = None + version: datasets.Version = None + description: str = None + schema: str = None + subset_id: str = None + + +class Tasks(Enum): + NAMED_ENTITY_RECOGNITION = "NER" + NAMED_ENTITY_DISAMBIGUATION = "NED" + EVENT_EXTRACTION = "EE" + RELATION_EXTRACTION = "RE" + COREFERENCE_RESOLUTION = "COREF" + QUESTION_ANSWERING = "QA" + TEXTUAL_ENTAILMENT = "TE" + SEMANTIC_SIMILARITY = "STS" + TEXT_PAIRS_CLASSIFICATION = "TXT2CLASS" + PARAPHRASING = "PARA" + TRANSLATION = "TRANSL" + SUMMARIZATION = "SUM" + TEXT_CLASSIFICATION = "TXTCLASS" + + +entailment_features = datasets.Features( + { + "id": datasets.Value("string"), + "premise": 
datasets.Value("string"), + "hypothesis": datasets.Value("string"), + "label": datasets.Value("string"), + } +) + +pairs_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text_1": datasets.Value("string"), + "text_2": datasets.Value("string"), + "label": datasets.Value("string"), + } +) + +qa_features = datasets.Features( + { + "id": datasets.Value("string"), + "question_id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "question": datasets.Value("string"), + "type": datasets.Value("string"), + "choices": [datasets.Value("string")], + "context": datasets.Value("string"), + "answer": datasets.Sequence(datasets.Value("string")), + } +) + +text_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text": datasets.Value("string"), + "labels": [datasets.Value("string")], + } +) + +text2text_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text_1": datasets.Value("string"), + "text_2": datasets.Value("string"), + "text_1_name": datasets.Value("string"), + "text_2_name": datasets.Value("string"), + } +) + +kb_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "passages": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + } + ], + "entities": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + "normalized": [ + { + "db_name": datasets.Value("string"), + "db_id": datasets.Value("string"), + } + ], + } + ], + "events": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + # refers to the 
text_bound_annotation of the trigger + "trigger": { + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + }, + "arguments": [ + { + "role": datasets.Value("string"), + "ref_id": datasets.Value("string"), + } + ], + } + ], + "coreferences": [ + { + "id": datasets.Value("string"), + "entity_ids": datasets.Sequence(datasets.Value("string")), + } + ], + "relations": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "arg1_id": datasets.Value("string"), + "arg2_id": datasets.Value("string"), + "normalized": [ + { + "db_name": datasets.Value("string"), + "db_id": datasets.Value("string"), + } + ], + } + ], + } +) + + +TASK_TO_SCHEMA = { + Tasks.NAMED_ENTITY_RECOGNITION.name: "KB", + Tasks.NAMED_ENTITY_DISAMBIGUATION.name: "KB", + Tasks.EVENT_EXTRACTION.name: "KB", + Tasks.RELATION_EXTRACTION.name: "KB", + Tasks.COREFERENCE_RESOLUTION.name: "KB", + Tasks.QUESTION_ANSWERING.name: "QA", + Tasks.TEXTUAL_ENTAILMENT.name: "TE", + Tasks.SEMANTIC_SIMILARITY.name: "PAIRS", + Tasks.TEXT_PAIRS_CLASSIFICATION.name: "PAIRS", + Tasks.PARAPHRASING.name: "T2T", + Tasks.TRANSLATION.name: "T2T", + Tasks.SUMMARIZATION.name: "T2T", + Tasks.TEXT_CLASSIFICATION.name: "TEXT", +} + +SCHEMA_TO_TASKS = defaultdict(set) +for task, schema in TASK_TO_SCHEMA.items(): + SCHEMA_TO_TASKS[schema].add(task) +SCHEMA_TO_TASKS = dict(SCHEMA_TO_TASKS) + +VALID_TASKS = set(TASK_TO_SCHEMA.keys()) +VALID_SCHEMAS = set(TASK_TO_SCHEMA.values()) + +SCHEMA_TO_FEATURES = { + "KB": kb_features, + "QA": qa_features, + "TE": entailment_features, + "T2T": text2text_features, + "TEXT": text_features, + "PAIRS": pairs_features, +} + + +def get_texts_and_offsets_from_bioc_ann(ann: "bioc.BioCAnnotation") -> Tuple: + + offsets = [(loc.offset, loc.offset + loc.length) for loc in ann.locations] + + text = ann.text + + if len(offsets) > 1: + i = 0 + texts = [] + for start, end in offsets: + chunk_len = end - start + texts.append(text[i : 
chunk_len + i]) + i += chunk_len + while i < len(text) and text[i] == " ": + i += 1 + else: + texts = [text] + + return offsets, texts + + +def remove_prefix(a: str, prefix: str) -> str: + if a.startswith(prefix): + a = a[len(prefix) :] + return a + + +def parse_brat_file( + txt_file: Path, + annotation_file_suffixes: List[str] = None, + parse_notes: bool = False, +) -> Dict: + """ + Parse a brat file into the schema defined below. + `txt_file` should be the path to the brat '.txt' file you want to parse, e.g. 'data/1234.txt' + Assumes that the annotations are contained in one or more of the corresponding '.a1', '.a2' or '.ann' files, + e.g. 'data/1234.ann' or 'data/1234.a1' and 'data/1234.a2'. + Will include annotator notes, when `parse_notes == True`. + brat_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text": datasets.Value("string"), + "text_bound_annotations": [ # T line in brat, e.g. type or event trigger + { + "offsets": datasets.Sequence([datasets.Value("int32")]), + "text": datasets.Sequence(datasets.Value("string")), + "type": datasets.Value("string"), + "id": datasets.Value("string"), + } + ], + "events": [ # E line in brat + { + "trigger": datasets.Value( + "string" + ), # refers to the text_bound_annotation of the trigger, + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "arguments": datasets.Sequence( + { + "role": datasets.Value("string"), + "ref_id": datasets.Value("string"), + } + ), + } + ], + "relations": [ # R line in brat + { + "id": datasets.Value("string"), + "head": { + "ref_id": datasets.Value("string"), + "role": datasets.Value("string"), + }, + "tail": { + "ref_id": datasets.Value("string"), + "role": datasets.Value("string"), + }, + "type": datasets.Value("string"), + } + ], + "equivalences": [ # Equiv line in brat + { + "id": datasets.Value("string"), + "ref_ids": datasets.Sequence(datasets.Value("string")), + } + ], + "attributes": [ # M or A 
lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "value": datasets.Value("string"), + } + ], + "normalizations": [ # N lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "resource_name": datasets.Value( + "string" + ), # Name of the resource, e.g. "Wikipedia" + "cuid": datasets.Value( + "string" + ), # ID in the resource, e.g. 534366 + "text": datasets.Value( + "string" + ), # Human readable description/name of the entity, e.g. "Barack Obama" + } + ], + ### OPTIONAL: Only included when `parse_notes == True` + "notes": [ # # lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "text": datasets.Value("string"), + } + ], + }, + ) + """ + + example = {} + example["document_id"] = txt_file.with_suffix("").name + with txt_file.open() as f: + example["text"] = f.read() + + # If no specific suffixes of the to-be-read annotation files are given - take standard suffixes + # for event extraction + if annotation_file_suffixes is None: + annotation_file_suffixes = [".a1", ".a2", ".ann"] + + if len(annotation_file_suffixes) == 0: + raise AssertionError( + "At least one suffix for the to-be-read annotation files should be given!" 
+ ) + + ann_lines = [] + for suffix in annotation_file_suffixes: + annotation_file = txt_file.with_suffix(suffix) + if annotation_file.exists(): + with annotation_file.open() as f: + ann_lines.extend(f.readlines()) + + example["text_bound_annotations"] = [] + example["events"] = [] + example["relations"] = [] + example["equivalences"] = [] + example["attributes"] = [] + example["normalizations"] = [] + + if parse_notes: + example["notes"] = [] + + for line in ann_lines: + line = line.strip() + if not line: + continue + + if line.startswith("T"): # Text bound + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["type"] = fields[1].split()[0] + ann["offsets"] = [] + span_str = remove_prefix(fields[1], (ann["type"] + " ")) + text = fields[2] + for span in span_str.split(";"): + start, end = span.split() + ann["offsets"].append([int(start), int(end)]) + + # Heuristically split text of discontiguous entities into chunks + ann["text"] = [] + if len(ann["offsets"]) > 1: + i = 0 + for start, end in ann["offsets"]: + chunk_len = end - start + ann["text"].append(text[i : chunk_len + i]) + i += chunk_len + while i < len(text) and text[i] == " ": + i += 1 + else: + ann["text"] = [text] + + example["text_bound_annotations"].append(ann) + + elif line.startswith("E"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + + ann["type"], ann["trigger"] = fields[1].split()[0].split(":") + + ann["arguments"] = [] + for role_ref_id in fields[1].split()[1:]: + argument = { + "role": (role_ref_id.split(":"))[0], + "ref_id": (role_ref_id.split(":"))[1], + } + ann["arguments"].append(argument) + + example["events"].append(ann) + + elif line.startswith("R"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["type"] = fields[1].split()[0] + + ann["head"] = { + "role": fields[1].split()[1].split(":")[0], + "ref_id": fields[1].split()[1].split(":")[1], + } + ann["tail"] = { + "role": fields[1].split()[2].split(":")[0], + "ref_id": 
fields[1].split()[2].split(":")[1], + } + + example["relations"].append(ann) + + # '*' seems to be the legacy way to mark equivalences, + # but I couldn't find any info on the current way + # this might have to be adapted dependent on the brat version + # of the annotation + elif line.startswith("*"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["ref_ids"] = fields[1].split()[1:] + + example["equivalences"].append(ann) + + elif line.startswith("A") or line.startswith("M"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + + info = fields[1].split() + ann["type"] = info[0] + ann["ref_id"] = info[1] + + if len(info) > 2: + ann["value"] = info[2] + else: + ann["value"] = "" + + example["attributes"].append(ann) + + elif line.startswith("N"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["text"] = fields[2] + + info = fields[1].split() + + ann["type"] = info[0] + ann["ref_id"] = info[1] + ann["resource_name"] = info[2].split(":")[0] + ann["cuid"] = info[2].split(":")[1] + example["normalizations"].append(ann) + + elif parse_notes and line.startswith("#"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["text"] = fields[2] if len(fields) == 3 else BigBioValues.NULL + + info = fields[1].split() + + ann["type"] = info[0] + ann["ref_id"] = info[1] + example["notes"].append(ann) + + return example + + +def brat_parse_to_bigbio_kb(brat_parse: Dict) -> Dict: + """ + Transform a brat parse (conforming to the standard brat schema) obtained with + `parse_brat_file` into a dictionary conforming to the `bigbio-kb` schema (as defined in ../schemas/kb.py) + :param brat_parse: + """ + + unified_example = {} + + # Prefix all ids with document id to ensure global uniqueness, + # because brat ids are only unique within their document + id_prefix = brat_parse["document_id"] + "_" + + # identical + unified_example["document_id"] = brat_parse["document_id"] + unified_example["passages"] = [ + { + 
"id": id_prefix + "_text", + "type": "abstract", + "text": [brat_parse["text"]], + "offsets": [[0, len(brat_parse["text"])]], + } + ] + + # get normalizations + ref_id_to_normalizations = defaultdict(list) + for normalization in brat_parse["normalizations"]: + ref_id_to_normalizations[normalization["ref_id"]].append( + { + "db_name": normalization["resource_name"], + "db_id": normalization["cuid"], + } + ) + + # separate entities and event triggers + unified_example["events"] = [] + non_event_ann = brat_parse["text_bound_annotations"].copy() + for event in brat_parse["events"]: + event = event.copy() + event["id"] = id_prefix + event["id"] + trigger = next( + tr + for tr in brat_parse["text_bound_annotations"] + if tr["id"] == event["trigger"] + ) + if trigger in non_event_ann: + non_event_ann.remove(trigger) + event["trigger"] = { + "text": trigger["text"].copy(), + "offsets": trigger["offsets"].copy(), + } + for argument in event["arguments"]: + argument["ref_id"] = id_prefix + argument["ref_id"] + + unified_example["events"].append(event) + + unified_example["entities"] = [] + anno_ids = [ref_id["id"] for ref_id in non_event_ann] + for ann in non_event_ann: + entity_ann = ann.copy() + entity_ann["id"] = id_prefix + entity_ann["id"] + entity_ann["normalized"] = ref_id_to_normalizations[ann["id"]] + unified_example["entities"].append(entity_ann) + + # massage relations + unified_example["relations"] = [] + skipped_relations = set() + for ann in brat_parse["relations"]: + if ( + ann["head"]["ref_id"] not in anno_ids + or ann["tail"]["ref_id"] not in anno_ids + ): + skipped_relations.add(ann["id"]) + continue + unified_example["relations"].append( + { + "arg1_id": id_prefix + ann["head"]["ref_id"], + "arg2_id": id_prefix + ann["tail"]["ref_id"], + "id": id_prefix + ann["id"], + "type": ann["type"], + "normalized": [], + } + ) + if len(skipped_relations) > 0: + example_id = brat_parse["document_id"] + logger.info( + f"Example:{example_id}: The `bigbio_kb` schema 
allows `relations` only between entities." + f" Skip (for now): " + f"{list(skipped_relations)}" + ) + + # get coreferences + unified_example["coreferences"] = [] + for i, ann in enumerate(brat_parse["equivalences"], start=1): + is_entity_cluster = True + for ref_id in ann["ref_ids"]: + if not ref_id.startswith("T"): # not textbound -> no entity + is_entity_cluster = False + elif ref_id not in anno_ids: # event trigger -> no entity + is_entity_cluster = False + if is_entity_cluster: + entity_ids = [id_prefix + i for i in ann["ref_ids"]] + unified_example["coreferences"].append( + {"id": id_prefix + str(i), "entity_ids": entity_ids} + ) + return unified_example diff --git a/bigbio/hub/hub_repos/czi_drsm/czi_drsm.py b/bigbio/hub/hub_repos/czi_drsm/czi_drsm.py new file mode 100644 index 000000000..24f54d457 --- /dev/null +++ b/bigbio/hub/hub_repos/czi_drsm/czi_drsm.py @@ -0,0 +1,410 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and Gully Burns. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Research Article document classification dataset based on aspects of disease research. 
Currently, the dataset consists of three subsets: + +(A) classifies title/abstracts of papers into most popular subtypes of clinical, basic, and translational papers (~20k papers); + - Clinical Characteristics, Disease Pathology, and Diagnosis - + Text that describes (A) symptoms, signs, or ‘phenotype’ of a disease; + (B) the effects of the disease on patient organs, tissues, or cells; + (C) the results of clinical tests that reveal pathology (including + biomarkers); (D) research that use this information to figure out + a diagnosis. + - Therapeutics in the clinic - + Text describing how treatments work in the clinic (but not in a clinical trial). + - Disease mechanism - + Text that describes either (A) mechanistic involvement of specific genes in disease + (deletions, gain of function, etc); (B) how molecular signalling or metabolism + binding, activating, phosphorylation, concentration increase, etc.) + are involved in the mechanism of a disease; or (C) the physiological + mechanism of disease at the level of tissues, organs, and body systems. + - Patient-Based Therapeutics - + Text describing (A) Clinical trials (studies of therapeutic measures being + used on patients in a clinical trial); (B) Post Marketing Drug Surveillance + (effects of a drug after approval in the general population or as part of + ‘standard healthcare’); (C) Drug repurposing (how a drug that has been + approved for one use is being applied to a new disease). + +(B) identifies whether a title/abstract of a paper describes substantive research into Quality of Life (~10k papers); + - -1 - the paper is not a primary experimental study in rare disease + - 0 - the study does not directly investigate quality of life + - 1 - the study investigates qol but not as its primary contribution + - 2 - the study's primary contribution centers on quality of life measures + +(C) identifies if a paper is a natural history study (~10k papers). 
+ - -1 - the paper is not a primary experimental study in rare disease
+ - 0 - the study is not directly investigating the natural history of a disease
+ - 1 - the study includes some elements of a natural history but not as its primary contribution
+ - 2 - the study's primary contribution centers on observing the time course of a rare disease
+
+These classifications are particularly relevant in rare disease research, a field that is generally understudied.
+"""
+
+import os
+from typing import List, Tuple, Dict
+
+import datasets
+import pandas as pd
+from pathlib import Path
+
+import bigbio.utils.parsing as parse
+from bigbio.utils import schemas
+from bigbio.utils.configs import BigBioConfig
+from bigbio.utils.constants import Lang, Tasks
+from bigbio.utils.license import Licenses
+
+#from .bigbiohub import BigBioConfig
+#from .bigbiohub import Tasks
+
+#from .bigbiohub import
+
+_LOCAL = False
+
+_CITATION = """\
+@article{,
+    author = {},
+    title = {},
+    journal = {},
+    volume = {},
+    year = {},
+    url = {},
+    doi = {},
+    biburl = {},
+    bibsource = {}
+}
+"""
+
+_DATASETNAME = "czi_drsm"
+
+_DESCRIPTION = """\
+Research Article document classification dataset based on aspects of disease research. Currently, the dataset consists of three subsets:
+
+(A) classifies title/abstracts of papers into most popular subtypes of clinical, basic, and translational papers (~20k papers);
+    - Clinical Characteristics, Disease Pathology, and Diagnosis -
+        Text that describes (A) symptoms, signs, or ‘phenotype’ of a disease;
+        (B) the effects of the disease on patient organs, tissues, or cells;
+        (C) the results of clinical tests that reveal pathology (including
+        biomarkers); (D) research that use this information to figure out
+        a diagnosis.
+    - Therapeutics in the clinic -
+        Text describing how treatments work in the clinic (but not in a clinical trial).
+    - Disease mechanism -
+        Text that describes either (A) mechanistic involvement of specific genes in disease
+        (deletions, gain of function, etc); (B) how molecular signalling or metabolism
+        (binding, activating, phosphorylation, concentration increase, etc.)
+        are involved in the mechanism of a disease; or (C) the physiological
+        mechanism of disease at the level of tissues, organs, and body systems.
+    - Patient-Based Therapeutics -
+        Text describing (A) Clinical trials (studies of therapeutic measures being
+        used on patients in a clinical trial); (B) Post Marketing Drug Surveillance
+        (effects of a drug after approval in the general population or as part of
+        ‘standard healthcare’); (C) Drug repurposing (how a drug that has been
+        approved for one use is being applied to a new disease).
+
+(B) identifies whether a title/abstract of a paper describes substantive research into Quality of Life (~10k papers);
+ - -1 - the paper is not a primary experimental study in rare disease
+ - 0 - the study does not directly investigate quality of life
+ - 1 - the study investigates qol but not as its primary contribution
+ - 2 - the study's primary contribution centers on quality of life measures
+
+(C) identifies if a paper is a natural history study (~10k papers).
+ - -1 - the paper is not a primary experimental study in rare disease
+ - 0 - the study is not directly investigating the natural history of a disease
+ - 1 - the study includes some elements of a natural history but not as its primary contribution
+ - 2 - the study's primary contribution centers on observing the time course of a rare disease
+
+These classifications are particularly relevant in rare disease research, a field that is generally understudied.
+""" + +_HOMEPAGE = "https://github.com/chanzuckerberg/DRSM-corpus/" +_LICENSE = "CC0_1p0" + +_LANGUAGES = ['English'] +_PUBMED = False +_LOCAL = False +_DISPLAYNAME = "DRSM Corpus" + +# For publicly available datasets you will most likely end up passing these URLs to dl_manager in _split_generators. +# In most cases the URLs will be the same for the source and bigbio config. +# However, if you need to access different files for each config you can have multiple entries in this dict. +# This can be an arbitrarily nested dict/list of URLs (see below in `_split_generators` method) +_URLS = { + 'base': "https://raw.githubusercontent.com/chanzuckerberg/DRSM-corpus/main/v1/drsm_corpus_v1.tsv", + 'qol': "https://raw.githubusercontent.com/chanzuckerberg/DRSM-corpus/main/v2/qol_all_2022_12_15.tsv", + 'nhs': "https://raw.githubusercontent.com/chanzuckerberg/DRSM-corpus/main/v2/nhs_all_2023_03_31.tsv" +} + +_SUPPORTED_TASKS = [Tasks.TEXT_CLASSIFICATION] + +_SOURCE_VERSION = "1.0.0" +_BIGBIO_VERSION = "1.0.0" + +_CLASS_NAMES_BASE = [ + "clinical characteristics or disease pathology", + "therapeutics in the clinic", + "disease mechanism", + "patient-based therapeutics", + "other", + "irrelevant" + ] + +_CLASS_NAMES_QOL = [ + "-1 - the paper is not a primary experimental study in rare disease", + "0 - the study does not directly investigate quality of life", + "1 - the study investigates qol but not as its primary contribution", + "2 - the study's primary contribution centers on quality of life measures" + ] + +_CLASS_NAMES_NHS = [ + "-1 - the paper is not a primary experimental study in rare disease", + "0 - the study is not directly investigating the natural history of a disease", + "1 - the study includes some elements a natural history but not as its primary contribution", + "2 - the study's primary contribution centers on observing the time course of a rare disease" + ] + +class DRSMBaseDataset(datasets.GeneratorBasedBuilder): + """DRSM Document Classification Datasets.""" 
+ + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + # You will be able to load the "source" or "bigbio" configurations with + #ds_source = datasets.load_dataset('drsm_source_dataset', name='source') + #ds_bigbio = datasets.load_dataset('drsm_bigbio_dataset', name='bigbio') + + # For local datasets you can make use of the `data_dir` and `data_files` kwargs + # https://huggingface.co/docs/datasets/add_dataset.html#downloading-data-files-and-organizing-splits + # ds_source = datasets.load_dataset('my_dataset', name='source', data_dir="/path/to/data/files") + # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio', data_dir="/path/to/data/files") + + # TODO: For each dataset, implement Config for Source and BigBio; + # If dataset contains more than one subset (see examples/bioasq.py) implement for EACH of them. + # Each of them should contain: + # - name: should be unique for each dataset config eg. bioasq10b_(source|bigbio)_[bigbio_schema_name] + # - version: option = (SOURCE_VERSION|BIGBIO_VERSION) + # - description: one line description for the dataset + # - schema: options = (source|bigbio_[bigbio_schema_name]) + # - subset_id: subset id is the canonical name for the dataset (eg. 
bioasq10b) + # where [bigbio_schema_name] = () + + BUILDER_CONFIGS = [ + BigBioConfig( + name="czi_drsm_base_source", + version=SOURCE_VERSION, + description="czi_drsm base source schema", + schema="base_source", + subset_id="czi_drsm_base", + ), + BigBioConfig( + name="czi_drsm_bigbio_base_text", + version=BIGBIO_VERSION, + description="czi_drsm base BigBio schema", + schema="bigbio_text", + subset_id="czi_drsm_base", + ), + BigBioConfig( + name="czi_drsm_qol_source", + version=SOURCE_VERSION, + description="czi_drsm source schema for Quality of Life studies", + schema="qol_source", + subset_id="czi_drsm_qol", + ), + BigBioConfig( + name="czi_drsm_bigbio_qol_text", + version=BIGBIO_VERSION, + description="czi_drsm BigBio schema for Quality of Life studies", + schema="bigbio_text", + subset_id="czi_drsm_qol", + ), + BigBioConfig( + name="czi_drsm_nhs_source", + version=SOURCE_VERSION, + description="czi_drsm source schema for Natural History Studies", + schema="nhs_source", + subset_id="czi_drsm_nhs", + ), + BigBioConfig( + name="czi_drsm_bigbio_nhs_text", + version=BIGBIO_VERSION, + description="czi_drsm BigBio schema for Natural History Studies", + schema="bigbio_text", + subset_id="czi_drsm_nhs", + ), + ] + + DEFAULT_CONFIG_NAME = "czi_drsm_bigbio_base_text" + + def _info(self) -> datasets.DatasetInfo: + + # Create the source schema; this schema will keep all keys/information/labels as close to the original dataset as possible. + + # You can arbitrarily nest lists and dictionaries. 
+ # For iterables, use lists over tuples or `datasets.Sequence` + + if self.config.schema == "base_source": + features = datasets.Features( + { + "document_id": datasets.Value("string"), + "labeling_state": datasets.Value("string"), + "explanation": datasets.Value("string"), + "correct_label": [datasets.ClassLabel(names=_CLASS_NAMES_BASE)], + "agreement": [datasets.Value("string")], + "title": [datasets.Value("string")], + "abstract": [datasets.Value("string")], + } + ) + + elif self.config.schema == "qol_source": + features = datasets.Features( + { + "document_id": datasets.Value("string"), + "labeling_state": datasets.Value("string"), + "correct_label": [datasets.ClassLabel(names=_CLASS_NAMES_QOL)], + "explanation": datasets.Value("string"), + "agreement": [datasets.Value("string")], + "title": [datasets.Value("string")], + "abstract": [datasets.Value("string")] + } + ) + + elif self.config.schema == "nhs_source": + features = datasets.Features( + { + "document_id": datasets.Value("string"), + "labeling_state": datasets.Value("string"), + "correct_label": [datasets.ClassLabel(names=_CLASS_NAMES_NHS)], + "explanation": datasets.Value("string"), + "agreement": [datasets.Value("string")], + "title": [datasets.Value("string")], + "abstract": [datasets.Value("string")], + } + ) + + # For example bigbio_kb, bigbio_t2t + elif self.config.schema == "bigbio_text": + features = schemas.text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + if 'base' in self.config.name: + url = _URLS['base'] + elif 'qol' in self.config.name: + url = _URLS['qol'] + elif 'nhs' in self.config.name: + url = _URLS['nhs'] + else: + raise ValueError("Invalid config name: {}".format(self.config.name)) + + data_file = dl_manager.download_and_extract(url) + df = 
pd.read_csv(data_file, sep="\t", encoding="utf-8").fillna('') + + # load tsv file into huggingface dataset + ds = datasets.Dataset.from_pandas(df) + + # generate train_test split + ds_dict = ds.train_test_split(test_size=0.2, seed=42) + ds_dict2 = ds_dict['test'].train_test_split(test_size=0.5, seed=42) + + # dump train, val, test to disk + data_dir = Path(data_file).parent + ds_dict['train'].to_csv(data_dir / "train.tsv", sep="\t", index=False) + ds_dict2['train'].to_csv(data_dir / "validation.tsv", sep="\t", index=False) + ds_dict2['test'].to_csv(data_dir / "test.tsv", sep="\t", index=False) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_dir / "train.tsv", + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": data_dir / "validation.tsv", + "split": "validation", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data_dir / "test.tsv", + "split": "test", + }, + ) + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + def _generate_examples(self, filepath, split) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + df = pd.read_csv(filepath, sep="\t", encoding="utf-8").fillna('') + print(len(df)) + for id_, l in df.iterrows(): + if self.config.subset_id == "czi_drsm_base": + doc_id = l[0] + labeling_state = l[1] + correct_label = l[2] + agreement = l[3] + explanation = l[4] + title = l[5] + abstract = l[6] + elif self.config.subset_id == "czi_drsm_qol": + doc_id = l[0] + labeling_state = l[1] + correct_label = l[2][1:-1] + explanation = l[3] + agreement = l[4] + title = l[5] + abstract = l[6] + elif self.config.subset_id == "czi_drsm_nhs": + doc_id = l[0] + labeling_state = l[1] + correct_label = l[2][1:-1] + explanation = '' + agreement = l[3] + title = l[4] + abstract = l[5] + + if "_source" in self.config.schema: + yield id_, { + 
"document_id": doc_id, + "labeling_state": labeling_state, + "explanation": explanation, + "correct_label": [correct_label], + "agreement": str(agreement), + "title": title, + "abstract": abstract + } + elif self.config.schema == "bigbio_text": + yield id_, { + "id": id_, + "document_id": doc_id, + "text": title + " " + abstract, + "labels": [correct_label] + } + +# This template is based on the following template from the datasets package: +# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py