diff --git a/bigbio/biodatasets/thomas2011/thomas2011.py b/bigbio/biodatasets/thomas2011/thomas2011.py
index 427c4be2..e5c5734d 100644
--- a/bigbio/biodatasets/thomas2011/thomas2011.py
+++ b/bigbio/biodatasets/thomas2011/thomas2011.py
@@ -43,15 +43,16 @@
 from pathlib import Path
 from shutil import rmtree
 from typing import Dict, List, Tuple
-
+import xml.etree.ElementTree as ET
 import datasets
 import pandas as pd
+import requests
 
 from bigbio.utils import schemas
 from bigbio.utils.configs import BigBioConfig
 from bigbio.utils.constants import Lang, Tasks
 from bigbio.utils.license import CustomLicense
-
+import time
 
 _LANGUAGES = [Lang.EN]
 _PUBMED = True
@@ -98,8 +99,7 @@
 # this is a backup url in case the official one will stop working
 # _URLS = ["http://github.com/rockt/SETH/zipball/master/"]
 _URLS = {
-    "source": "https://www.scai.fraunhofer.de/content/dam/scai/de/downloads/bioinformatik/normalization-variation-corpus.gz",
-    "bigbio_kb": "https://www.scai.fraunhofer.de/content/dam/scai/de/downloads/bioinformatik/normalization-variation-corpus.gz",
+    _DATASETNAME: "https://www.scai.fraunhofer.de/content/dam/scai/de/downloads/bioinformatik/normalization-variation-corpus.gz",
 }
 
 _SUPPORTED_TASKS = [
@@ -117,35 +117,16 @@ class Thomas2011Dataset(datasets.GeneratorBasedBuilder):
     SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
     BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION)
 
-    # You will be able to load the "source" or "bigbio" configurations with
-    #    ds_source = datasets.load_dataset('my_dataset', name='source')
-    #    ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio')
-
-    # For local datasets you can make use of the `data_dir` and `data_files` kwargs
-    # https://huggingface.co/docs/datasets/add_dataset.html#downloading-data-files-and-organizing-splits
-    #    ds_source = datasets.load_dataset('my_dataset', name='source', data_dir="/path/to/data/files")
-    #    ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio', data_dir="/path/to/data/files")
-
-    # TODO: For each dataset, implement Config for Source and BigBio;
-    #  If dataset contains more than one subset (see examples/bioasq.py) implement for EACH of them.
-    #  Each of them should contain:
-    #   - name: should be unique for each dataset config eg. bioasq10b_(source|bigbio)_[bigbio_schema_name]
-    #   - version: option = (SOURCE_VERSION|BIGBIO_VERSION)
-    #   - description: one line description for the dataset
-    #   - schema: options = (source|bigbio_[bigbio_schema_name])
-    #   - subset_id: subset id is the canonical name for the dataset (eg. bioasq10b)
-    #    where [bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment)
-
     BUILDER_CONFIGS = [
         BigBioConfig(
-            name="thomas2011_source",
+            name=f"{_DATASETNAME}_source",
             version=SOURCE_VERSION,
             description="Thomas et al 2011 source schema",
             schema="source",
             subset_id="thomas2011",
         ),
         BigBioConfig(
-            name="thomas2011_bigbio_kb",
+            name=f"{_DATASETNAME}_bigbio_kb",
            version=BIGBIO_VERSION,
             description="Thomas et al 2011 BigBio schema",
             schema="bigbio_kb",
@@ -153,16 +134,10 @@ class Thomas2011Dataset(datasets.GeneratorBasedBuilder):
         ),
     ]
 
-    DEFAULT_CONFIG_NAME = "thomas2011_source"
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
 
     def _info(self) -> datasets.DatasetInfo:
-        # Create the source schema; this schema will keep all keys/information/labels as close to the original dataset as possible.
-        # Much of this design is copied from biodatasets/verspoor_2013/verspoor_2013.py
-
-        # You can arbitrarily nest lists and dictionaries.
-        # For iterables, use lists over tuples or `datasets.Sequence`
-
         if self.config.schema == "source":
             features = datasets.Features(
                 {
@@ -188,54 +163,53 @@ def _info(self) -> datasets.DatasetInfo:
 
     def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
         """Returns SplitGenerators."""
-        # Download gets entire git repo containing unused data from other datasets
-        # repo_dir = Path(dl_manager.download_and_extract(_URLS[0]))
-        # data_dir = repo_dir / "data"
-        # data_dir.mkdir(exist_ok=True)
-
-        # Find the relevant files from Verspor2013 and move them to a new directory
-        # thomas2011_files = repo_dir.glob("*/*/*thomas2011/**/*")
-        # for file in thomas2011_files:
-        #     if file.is_file() and "README" not in str(file):
-        #         file.rename(data_dir / file.name)
-
-        # Delete all unused files and directories from the original download
-        # for x in repo_dir.glob("[!data]*"):
-        #     if x.is_file():
-        #         x.unlink()
-        #     elif x.is_dir():
-        #         rmtree(x)
-
-        data_dir = dl_manager.download_and_extract(_URLS[self.config.schema])
+        data_dir = dl_manager.download_and_extract(_URLS[_DATASETNAME])
 
         return [
             datasets.SplitGenerator(
-                name=datasets.Split.TEST,
+                name=datasets.Split.TRAIN,
                 # Whatever you put in gen_kwargs will be passed to _generate_examples
                 gen_kwargs={
                     "filepath": os.path.join(data_dir, "annotations.txt"),
-                    "split": "test",
                 },
             )
         ]
 
-    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
-
-    # TODO: change the args of this function to match the keys in `gen_kwargs`. You may add any necessary kwargs.
-    def _generate_examples(self, filepath: str, split: str) -> Tuple[int, Dict]:
+    def get_clean_pubmed_abstract(self, id):
+        url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
+        params = {
+            "db": "pubmed",
+            "id": id,
+            "retmode": "xml",
+            "rettype": "medline",
+        }
+        res = requests.get(url, params=params)
+        tree = ET.XML(res.text)
+        article = tree.find("PubmedArticle").find("MedlineCitation").find("Article")
+        article_title = article.find("ArticleTitle").text
+        abstract_parts = [f"{article_title}"]
+        article_abstract = article.find("Abstract").findall("AbstractText")
+        for abstract_part in article_abstract:
+            label = abstract_part.attrib.get("Label", "")
+            if label:
+                abstract_parts.append(f"{label}: {abstract_part.text}")
+            else:
+                abstract_parts.append(abstract_part.text)
+        return article_title, " ".join(abstract_parts)
+
+    def _generate_examples(self, filepath: str) -> Tuple[int, Dict]:
         """Yields examples as (key, example) tuples."""
-        if split == "test":
-            data_ann = []
-            with open(filepath, encoding="utf-8") as ann_tsv_file:
-                csv_reader_code = csv.reader(
-                    ann_tsv_file,
-                    quotechar="'",
-                    delimiter="\t",
-                    quoting=csv.QUOTE_ALL,
-                    skipinitialspace=True,
-                )
-                for id_, row in enumerate(csv_reader_code):
-                    data_ann.append(row)
+        data_ann = []
+        with open(filepath, encoding="utf-8") as ann_tsv_file:
+            csv_reader_code = csv.reader(
+                ann_tsv_file,
+                quotechar="'",
+                delimiter="\t",
+                quoting=csv.QUOTE_ALL,
+                skipinitialspace=True,
+            )
+            for id_, row in enumerate(csv_reader_code):
+                data_ann.append(row)
 
         if self.config.schema == "source":
             for id_, row in enumerate(data_ann):
@@ -259,20 +233,39 @@ def _generate_examples(self, filepath: str, split: str) -> Tuple[int, Dict]:
             ]
             df = pd.DataFrame(data_ann, columns=cols)
             uid = 0
+            curr_count = 0
             for id_ in df.doc_id.unique():
+                curr_count += 1
+                if curr_count == 3:
+                    # The PubMed API limits 3 requests per second without an API key
+                    time.sleep(0.5)
+                    curr_count = 0
                 elist = []
+                article_title, abstract_text = self.get_clean_pubmed_abstract(id_)
+                uid += 1
+                passage = {
+                    "id": uid,
+                    "type": "",
+                    "text": [abstract_text],
+                    "offsets": [[0, len(abstract_text)]],
+                }
+
                 for row in df.loc[df.doc_id == id_].itertuples():
                     uid += 1
                     if row.protein_or_nucleotide_sequence_mutation == "PSM":
                         ent_type = "Protein Sequence Mutation"
                     else:
                         ent_type = "Nucleotide Sequence Mutation"
+                    tag_start, tag_end = int(row.off1), int(row.off2)
+                    if tag_start > len(article_title):
+                        tag_start -= 1
+                        tag_end -= 1
                     elist.append(
                         {
                             "id": str(uid),
                             "type": ent_type,
                             "text": [row.covered_text],
-                            "offsets": [[int(row.off1), int(row.off2)]],
+                            "offsets": [[tag_start, tag_end]],
                             "normalized": [{"db_name": "dbSNP", "db_id": row.dbSNP_id}],
                         }
                     )
@@ -280,12 +273,8 @@ def _generate_examples(self, filepath: str, split: str) -> Tuple[int, Dict]:
                     "id": id_,  # uid is an unique identifier for every record that starts from 1
                     "document_id": str(row[0]),
                     "entities": elist,
-                    "passages": [],
+                    "passages": [passage],
                     "events": [],
                     "coreferences": [],
                     "relations": [],
                 }
-
-
-# This template is based on the following template from the datasets package:
-# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py
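
For local testing of the abstract fetching introduced in get_clean_pubmed_abstract, the following is a minimal standalone sketch (not part of the patch). It mirrors the same efetch call but adds an optional NCBI API key (E-utilities allow roughly 3 requests/second without a key and about 10 with one), a timeout, and a guard for articles without an Abstract element. The function name, the NCBI_API_KEY environment variable, and the placeholder PMIDs are illustrative assumptions, not code from this change.

# Illustrative standalone sketch only; names and placeholder PMIDs are hypothetical.
import os
import time
import xml.etree.ElementTree as ET
from typing import Optional, Tuple

import requests

EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"


def fetch_title_and_abstract(pmid: str, api_key: Optional[str] = None) -> Tuple[str, str]:
    """Fetch a PubMed title plus flattened abstract text, as the loader does."""
    params = {"db": "pubmed", "id": pmid, "retmode": "xml", "rettype": "medline"}
    if api_key:
        params["api_key"] = api_key  # raises the E-utilities per-second limit
    res = requests.get(EFETCH_URL, params=params, timeout=30)
    res.raise_for_status()
    article = ET.fromstring(res.content).find("PubmedArticle/MedlineCitation/Article")
    title = article.findtext("ArticleTitle")
    parts = [title]
    abstract = article.find("Abstract")
    if abstract is not None:
        for part in abstract.findall("AbstractText"):
            label = part.attrib.get("Label", "")
            parts.append(f"{label}: {part.text}" if label else part.text)
    return title, " ".join(parts)


if __name__ == "__main__":
    api_key = os.environ.get("NCBI_API_KEY")  # optional; the loader itself does not use one
    for pmid in ["12345678", "23456789"]:  # placeholder PMIDs
        title, abstract = fetch_title_and_abstract(pmid, api_key)
        print(title)
        # Throttle between calls: ~3 req/s without a key, ~10 req/s with one.
        time.sleep(0.34 if api_key is None else 0.1)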
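
Once the loader runs, the regenerated bigbio_kb examples can be spot-checked with a sketch like the one below, which counts entities whose adjusted offsets do not line up with the fetched title-plus-abstract passage. It assumes a datasets version that still resolves local loading scripts; the script path is an assumption about the local checkout, while the config name, split, and field names come from the diff above.

# Illustrative consumer-side check; the local script path is an assumption.
import datasets

ds = datasets.load_dataset(
    "bigbio/biodatasets/thomas2011/thomas2011.py",
    name="thomas2011_bigbio_kb",
)

mismatches = 0
for doc in ds["train"]:
    passage_text = doc["passages"][0]["text"][0]  # title + abstract built by the loader
    for entity in doc["entities"]:
        start, end = entity["offsets"][0]
        if passage_text[start:end] != entity["text"][0]:
            mismatches += 1
print(f"entities whose adjusted offsets do not match the passage text: {mismatches}")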