Skip to content

Commit

Permalink
feat: Standardize descriptive stats (#1375)
Browse files Browse the repository at this point in the history
* init

* mock tests

* remove debug prints

* remove descriptive stats and move to n_samples

* fix typo

* fix create task table

* fix citations

* remove n_samples

* metadata per task

* add test

* reformat task table

* add n_samples property

* fix tests

* rename total_symbols to number_of_characters

* Update mteb/abstasks/AbsTaskRetrieval.py

Co-authored-by: Kenneth Enevoldsen <[email protected]>

* rename in tests

* lint

---------

Co-authored-by: Kenneth Enevoldsen <[email protected]>
  • Loading branch information
Samoed and KennethEnevoldsen authored Nov 6, 2024
1 parent 76c772d commit 2854fa2
Show file tree
Hide file tree
Showing 489 changed files with 13,667 additions and 15,062 deletions.
7 changes: 3 additions & 4 deletions docs/adding_a_dataset.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,15 +37,14 @@ class SciDocsReranking(AbsTaskReranking):
dataset={
"path": "mteb/scidocs-reranking",
"revision": "d3c5e1fc0b855ab6097bf1cda04dd73947d7caab",
}
},
date=("2000-01-01", "2020-12-31"), # best guess
domains=["Academic", "Non-fiction", "Domains"],
task_subtypes=["Scientific Reranking"],
license="cc-by-4.0",
annotations_creators="derived",
dialect=[],
sample_creation="found",
descriptive_stats={"n_samples": {"test": 19599}, "avg_character_length": {"test": 69.0}},
bibtex_citation="""
@inproceedings{cohan-etal-2020-specter,
title = "{SPECTER}: Document-level Representation Learning using Citation-informed Transformers",
Expand Down Expand Up @@ -73,7 +72,7 @@ class SciDocsReranking(AbsTaskReranking):

# testing the task with a model:
model = SentenceTransformer("average_word_embeddings_komninos")
evaluation = MTEB(tasks=[MindSmallReranking()])
evaluation = MTEB(tasks=[SciDocsReranking()])
evaluation.run(model)
```

Expand Down Expand Up @@ -109,7 +108,7 @@ class VGClustering(AbsTaskClustering):
dialect=[],
text_creation="found",
bibtex_citation= ... # removed for brevity
)
)

def dataset_transform(self):
splits = self.description["eval_splits"]
Expand Down
27 changes: 13 additions & 14 deletions docs/create_tasks_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,17 @@ def author_from_bibtex(bibtex: str | None) -> str:
return f" ({author_str_w_et_al}, {year_str})"


def round_floats_in_dict(d: dict, precision: int = 2) -> dict:
    """Return a copy of ``d`` with every float value rounded to ``precision`` digits.

    Nested dictionaries are processed recursively. The input mapping is never
    modified, so rounding values for display does not alter the object the
    caller passed in.

    Args:
        d: Mapping to process. A non-dict argument is returned unchanged.
        precision: Number of decimal digits passed to ``round``.

    Returns:
        A new dict with the same keys: float values are rounded, dict values are
        replaced by rounded copies, and every other value is carried over as-is.
    """
    if not isinstance(d, dict):
        return d
    # Build a fresh dict instead of assigning back into ``d``: writing into the
    # argument would change the caller's data as a side effect.
    rounded = {}
    for key, value in d.items():
        if isinstance(value, float):
            rounded[key] = round(value, precision)
        elif isinstance(value, dict):
            rounded[key] = round_floats_in_dict(value, precision)
        else:
            rounded[key] = value
    return rounded


def task_to_markdown_row(task: mteb.AbsTask) -> str:
name = task.metadata.name
name_w_reference = (
Expand All @@ -40,20 +51,8 @@ def task_to_markdown_row(task: mteb.AbsTask) -> str:
domains = (
"[" + ", ".join(task.metadata.domains) + "]" if task.metadata.domains else ""
)
n_samples = (
task.metadata.descriptive_stats["n_samples"]
if "n_samples" in task.metadata.descriptive_stats
else ""
)
dataset_statistics = ""
if "avg_character_length" in task.metadata.descriptive_stats:
dataset_statistics = task.metadata.descriptive_stats["avg_character_length"]
elif len(task.metadata.descriptive_stats) > 1:
all_stat = task.metadata.descriptive_stats
all_stat.pop("n_samples")
if len(all_stat) > 0:
dataset_statistics = all_stat

n_samples = task.metadata.n_samples
dataset_statistics = round_floats_in_dict(task.metadata.descriptive_stats)
name_w_reference += author_from_bibtex(task.metadata.bibtex_citation)

return f"| {name_w_reference} | {task.metadata.languages} | {task.metadata.type} | {task.metadata.category} | {domains} | {n_samples} | {dataset_statistics} |"
Expand Down
51 changes: 29 additions & 22 deletions mteb/abstasks/AbsTask.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from __future__ import annotations

import json
import logging
import random
from abc import ABC, abstractmethod
from collections.abc import Sequence
from typing import Any, TypedDict
from typing import Any

import datasets
import numpy as np
Expand All @@ -14,7 +15,7 @@
from sklearn.preprocessing import MultiLabelBinarizer

from mteb.abstasks.stratification import _iterative_train_test_split
from mteb.abstasks.TaskMetadata import HFSubset, TaskMetadata
from mteb.abstasks.TaskMetadata import DescriptiveStatistics, HFSubset, TaskMetadata
from mteb.encoder_interface import Encoder
from mteb.languages import LanguageScripts

Expand Down Expand Up @@ -53,12 +54,6 @@ def _multilabel_subsampling(
return dataset_dict


class DescriptiveStatistics(TypedDict):
    """Base typed dictionary for descriptive statistics.

    Declares no keys of its own; subclasses add the fields they report.
    """


class AbsTask(ABC):
metadata: TaskMetadata
abstask_prompt: str | None = None
Expand Down Expand Up @@ -195,38 +190,50 @@ def load_data(self, **kwargs):
self.data_loaded = True

def calculate_metadata_metrics(
self,
self, overwrite_results: bool = False
) -> dict[str, DescriptiveStatistics | dict[str, DescriptiveStatistics]]:
if self.metadata.descriptive_stat_path.exists() and not overwrite_results:
logger.info("Loading metadata descriptive statistics from cache.")
return self.metadata.descriptive_stats

self.load_data()

all_details = {}
pbar_split = tqdm.tqdm(
self.metadata_dict["eval_splits"], desc="Processing Splits..."
)
descriptive_stats = {}
hf_subset_stat = "hf_subset_descriptive_stats"
pbar_split = tqdm.tqdm(self.metadata.eval_splits, desc="Processing Splits...")
for split in pbar_split:
pbar_split.set_postfix_str(f"Split: {split}")
print(f"Processing metadata for split {split}")
logger.info(f"Processing metadata for split {split}")
if self.is_multilingual:
all_details[split] = self._calculate_metrics_from_split(
descriptive_stats[split] = self._calculate_metrics_from_split(
split, compute_overall=True
)
all_details[split]["hf_subset_descriptive_stats"] = {}
descriptive_stats[split][hf_subset_stat] = {}

eval_langs = (
list(self.metadata.eval_langs.keys())
if isinstance(self.metadata.eval_langs, dict)
else self.metadata.eval_langs
)
if self.metadata.type == "Classification":
eval_langs += ["train"]

pbar_subsets = tqdm.tqdm(
self.metadata.eval_langs, desc="Processing Languages..."
)
for hf_subset in pbar_subsets:
pbar_subsets.set_postfix_str(f"Language: {hf_subset}")
print(f"Processing metadata for language {hf_subset}")
logger.info(f"Processing metadata for language {hf_subset}")
split_details = self._calculate_metrics_from_split(split, hf_subset)
all_details[split]["hf_subset_descriptive_stats"][hf_subset] = (
split_details
)
descriptive_stats[split][hf_subset_stat][hf_subset] = split_details
else:
split_details = self._calculate_metrics_from_split(split)
all_details[split] = split_details
descriptive_stats[split] = split_details

with self.metadata.descriptive_stat_path.open("w") as f:
json.dump(descriptive_stats, f, indent=4)

return all_details
return descriptive_stats

@abstractmethod
def _calculate_metrics_from_split(
Expand Down
6 changes: 5 additions & 1 deletion mteb/abstasks/AbsTaskBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@

from ..evaluation.evaluators import BitextMiningEvaluator
from ..load_results.task_results import HFSubset, ScoresDict
from .AbsTask import AbsTask, DescriptiveStatistics
from .AbsTask import AbsTask
from .TaskMetadata import DescriptiveStatistics

logger = logging.getLogger(__name__)

Expand All @@ -19,11 +20,13 @@ class BitextDescriptiveStatistics(DescriptiveStatistics):
Attributes:
num_samples: number of samples in the dataset.
number_of_characters: Total number of symbols in the dataset.
average_sentence1_length: Average length of sentence1
average_sentence2_length: Average length of sentence2
"""

num_samples: int
number_of_characters: int
average_sentence1_length: float
average_sentence2_length: float

Expand Down Expand Up @@ -157,4 +160,5 @@ def _calculate_metrics_from_split(
average_sentence1_length=total_s1_len / len(sentence1),
average_sentence2_length=total_s2_len / len(sentence2),
num_samples=len(sentence1),
number_of_characters=total_s1_len + total_s2_len,
)
45 changes: 5 additions & 40 deletions mteb/abstasks/AbsTaskClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from typing import Any

import numpy as np
import tqdm

from mteb.encoder_interface import Encoder

Expand All @@ -15,7 +14,8 @@
logRegClassificationEvaluator,
)
from ..load_results.task_results import HFSubset, ScoresDict
from .AbsTask import AbsTask, DescriptiveStatistics
from .AbsTask import AbsTask
from .TaskMetadata import DescriptiveStatistics

logger = logging.getLogger(__name__)

Expand All @@ -25,12 +25,14 @@ class ClassificationDescriptiveStatistics(DescriptiveStatistics):
Attributes:
num_samples: number of samples in the dataset.
number_of_characters: Total number of symbols in the dataset.
average_text_length: Average length of text
unique_labels: Number of unique labels
labels: dict of label frequencies
"""

num_samples: int
number_of_characters: int
average_text_length: float
unique_labels: int
labels: dict[str, dict[str, int]]
Expand Down Expand Up @@ -201,44 +203,6 @@ def _undersample_data(self, X, y, samples_per_label: int, idxs=None):
label_counter[y[i]] += 1
return X_sampled, y_sampled, idxs

def calculate_metadata_metrics(
self,
) -> dict[
str,
ClassificationDescriptiveStatistics
| dict[str, ClassificationDescriptiveStatistics],
]:
self.load_data()

# same function from parent class, but added explicitly train to splits

all_details = {}
pbar_split = tqdm.tqdm(
self.metadata.eval_splits + ["train"], desc="Processing Splits..."
)
for split in pbar_split:
pbar_split.set_postfix_str(f"Split: {split}")
logger.info(f"Processing metadata for split {split}")
if self.is_multilingual:
all_details[split] = self._calculate_metrics_from_split(
split, compute_overall=True
)
all_details[split]["hf_subset_descriptive_stats"] = {}

pbar_subset = tqdm.tqdm(
self.metadata.eval_langs, desc="Processing Languages..."
)
for hf_subset in pbar_subset:
pbar_subset.set_postfix_str(f"Language: {hf_subset}")
logger.info(f"Processing metadata for language {hf_subset}")
split_details = self._calculate_metrics_from_split(split, hf_subset)
all_details[split][hf_subset] = split_details
else:
split_details = self._calculate_metrics_from_split(split)
all_details[split] = split_details

return all_details

def _calculate_metrics_from_split(
self, split: str, hf_subset: str | None = None, compute_overall: bool = False
) -> ClassificationDescriptiveStatistics:
Expand All @@ -259,6 +223,7 @@ def _calculate_metrics_from_split(
label_count = Counter(label)
return ClassificationDescriptiveStatistics(
num_samples=len(text),
number_of_characters=total_text_len,
average_text_length=total_text_len / len(text),
unique_labels=len(label_count),
labels={
Expand Down
6 changes: 5 additions & 1 deletion mteb/abstasks/AbsTaskClustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
from mteb.load_results.task_results import ScoresDict

from ..evaluation.evaluators import ClusteringEvaluator
from .AbsTask import AbsTask, DescriptiveStatistics
from .AbsTask import AbsTask
from .TaskMetadata import DescriptiveStatistics

logger = logging.getLogger(__name__)

Expand All @@ -22,13 +23,15 @@ class ClusteringDescriptiveStatistics(DescriptiveStatistics):
Attributes:
num_samples: number of samples in the dataset.
number_of_characters: Total number of symbols in the dataset.
average_text_length: Average length of text
average_labels_per_text: Average number of labels per text
unique_labels: Number of unique labels
labels: dict of label frequencies
"""

num_samples: int
number_of_characters: int
average_text_length: float
average_labels_per_text: float
unique_labels: int
Expand Down Expand Up @@ -103,6 +106,7 @@ def _calculate_metrics_from_split(
label_counter = Counter(total_labels)
return ClusteringDescriptiveStatistics(
num_samples=len(sentences),
number_of_characters=total_text_len,
average_text_length=total_text_len / len(sentences),
average_labels_per_text=len(total_labels) / len(sentences),
unique_labels=len(label_counter),
Expand Down
6 changes: 5 additions & 1 deletion mteb/abstasks/AbsTaskClusteringFast.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
from mteb.encoder_interface import Encoder

from ..load_results.task_results import HFSubset
from .AbsTask import AbsTask, DescriptiveStatistics
from .AbsTask import AbsTask
from .TaskMetadata import DescriptiveStatistics

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -83,13 +84,15 @@ class ClusteringFastDescriptiveStatistics(DescriptiveStatistics):
Attributes:
num_samples: number of samples in the dataset.
number_of_characters: Total number of symbols in the dataset.
average_text_length: Average length of text
average_labels_per_text: Average number of labels per text
unique_labels: Number of unique labels
labels: dict of label frequencies
"""

num_samples: int
number_of_characters: int
average_text_length: float
average_labels_per_text: float
unique_labels: int
Expand Down Expand Up @@ -233,6 +236,7 @@ def _calculate_metrics_from_split(
label_counter = Counter(total_labels)
return ClusteringFastDescriptiveStatistics(
num_samples=len(sentences),
number_of_characters=total_text_len,
average_text_length=total_text_len / len(sentences),
average_labels_per_text=len(total_labels) / len(sentences),
unique_labels=len(label_counter),
Expand Down
Loading

0 comments on commit 2854fa2

Please sign in to comment.