
feat: update English benchmarks and mark MMTEB benchmarks as beta (#1341)

* feat: update English benchmarks and mark MMTEB benchmarks as beta

* Added summEvalv2

* Update docs with new MTEB_MAIN_EN rename
KennethEnevoldsen authored Oct 28, 2024
1 parent 5c38eb8 commit 61371dd
Showing 11 changed files with 87 additions and 20 deletions.
2 changes: 1 addition & 1 deletion README.md
````diff
@@ -164,7 +164,7 @@ For instance to select the 56 English datasets that form the "Overall MTEB Engli
 
 ```python
 import mteb
-benchmark = mteb.get_benchmark("MTEB(eng)")
+benchmark = mteb.get_benchmark("MTEB(eng, classic)")
 evaluation = mteb.MTEB(tasks=benchmark)
 ```
 
````
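For readers trying the change locally, here is a minimal end-to-end sketch using the renamed benchmark. The model name and output folder are illustrative only; the `get_model`/`run` calls mirror the snippet in `docs/adding_a_model.md` below.

```python
import mteb

# Fetch the renamed classic English benchmark from this commit.
benchmark = mteb.get_benchmark("MTEB(eng, classic)")
evaluation = mteb.MTEB(tasks=benchmark)

# Illustrative model choice; any model mteb.get_model() can load should work.
model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")
evaluation.run(model, output_folder="results")
```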
3 changes: 1 addition & 2 deletions docs/adding_a_model.md
```diff
@@ -14,8 +14,7 @@ model = mteb.get_model("sentence-transformers/paraphrase-multilingual-MiniLM-L12
 
 tasks = mteb.get_tasks(...) # get specific tasks
 # or
-from mteb.benchmarks import MTEB_MAIN_EN
-tasks = MTEB_MAIN_EN # or use a specific benchmark
+tasks = mteb.get_benchmark("MTEB(eng, classic)") # or use a specific benchmark
 
 evaluation = mteb.MTEB(tasks=tasks)
 evaluation.run(model, output_folder="results")
```
7 changes: 4 additions & 3 deletions mteb/__init__.py
```diff
@@ -3,7 +3,7 @@
 from importlib.metadata import version
 
 from mteb.benchmarks.benchmarks import (
-    MTEB_MAIN_EN,
+    MTEB_ENG_CLASSIC,
     MTEB_MAIN_RU,
     MTEB_RETRIEVAL_LAW,
     MTEB_RETRIEVAL_WITH_INSTRUCTIONS,
@@ -15,13 +15,13 @@
 from mteb.overview import TASKS_REGISTRY, get_task, get_tasks
 
 from .benchmarks.benchmarks import Benchmark
-from .benchmarks.get_benchmark import get_benchmark, get_benchmarks
+from .benchmarks.get_benchmark import BENCHMARK_REGISTRY, get_benchmark, get_benchmarks
 
 __version__ = version("mteb")  # fetch version from install metadata
 
 
 __all__ = [
-    "MTEB_MAIN_EN",
+    "MTEB_ENG_CLASSIC",
     "MTEB_MAIN_RU",
     "MTEB_RETRIEVAL_LAW",
     "MTEB_RETRIEVAL_WITH_INSTRUCTIONS",
@@ -37,4 +37,5 @@
     "get_benchmark",
     "get_benchmarks",
     "BenchmarkResults",
+    "BENCHMARK_REGISTRY",
 ]
```
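With `BENCHMARK_REGISTRY` now exported, callers can discover benchmarks programmatically. A short sketch: `get_benchmarks()` returning `Benchmark` objects is confirmed by the leaderboard change below, while the exact shape of `BENCHMARK_REGISTRY` (assumed here to be a name-to-`Benchmark` mapping) is not shown in this diff.

```python
import mteb

# Enumerate all registered benchmarks, including the new beta suites.
for benchmark in mteb.get_benchmarks():
    print(benchmark.name)

# Assumption: BENCHMARK_REGISTRY maps benchmark names to Benchmark objects.
print("MTEB(eng, beta)" in mteb.BENCHMARK_REGISTRY)
```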
65 changes: 59 additions & 6 deletions mteb/benchmarks/benchmarks.py
```diff
@@ -62,8 +62,61 @@ def load_results(
         return base_results.select_tasks(self.tasks)
 
 
-MTEB_MAIN_EN = Benchmark(
-    name="MTEB(eng)",
+MTEB_EN = Benchmark(
+    name="MTEB(eng, beta)",
+    tasks=get_tasks(
+        tasks=[
+            "AmazonCounterfactualClassification",
+            "ArguAna",
+            "ArXivHierarchicalClusteringP2P",
+            "ArXivHierarchicalClusteringS2S",
+            "AskUbuntuDupQuestions",
+            "BIOSSES",
+            "Banking77Classification",
+            "BiorxivClusteringP2P.v2",
+            "CQADupstackGamingRetrieval",
+            "CQADupstackUnixRetrieval",
+            "ClimateFEVERHardNegatives",
+            "FEVERHardNegatives",
+            "FiQA2018",
+            "HotpotQAHardNegatives",
+            "ImdbClassification",
+            "MTOPDomainClassification",
+            "MassiveIntentClassification",
+            "MassiveScenarioClassification",
+            "MedrxivClusteringP2P.v2",
+            "MedrxivClusteringS2S.v2",
+            "MindSmallReranking",
+            "SCIDOCS",
+            "SICK-R",
+            "STS12",
+            "STS13",
+            "STS14",
+            "STS15",
+            "STS17",
+            "STS22.v2",
+            "STSBenchmark",
+            "SprintDuplicateQuestions",
+            "StackExchangeClustering.v2",
+            "StackExchangeClusteringP2P.v2",
+            "TRECCOVID",
+            "Touche2020",
+            "ToxicConversationsClassification",
+            "TweetSentimentExtractionClassification",
+            "TwentyNewsgroupsClustering.v2",
+            "TwitterSemEval2015",
+            "TwitterURLCorpus",
+            "SummEvalSummarization.v2",
+        ],
+        languages=["eng"],
+        eval_splits=["test"],
+    ),
+    description="English benchmarks from MTEB",
+    citation="",
+)
+
+MTEB_ENG_CLASSIC = Benchmark(
+    name="MTEB(eng, classic)",
     tasks=get_tasks(
         tasks=[
             "AmazonCounterfactualClassification",
@@ -137,7 +190,7 @@ def load_results(
         languages=["eng"],
         eval_splits=["test"],
     ),
-    description="Main English benchmarks from MTEB",
+    description="The original English benchmarks by Muennighoff et al., (2023).",
     citation="""@inproceedings{muennighoff-etal-2023-mteb,
     title = "{MTEB}: Massive Text Embedding Benchmark",
     author = "Muennighoff, Niklas and
@@ -556,7 +609,7 @@ def load_results(
 
 
 MTEB_multilingual = Benchmark(
-    name="MTEB(Multilingual)",
+    name="MTEB(Multilingual, beta)",
     tasks=get_tasks(
         tasks=[
             "BornholmBitextMining",
@@ -734,7 +787,7 @@ def load_results(
 
 
 MTEB_INDIC = Benchmark(
-    name="MTEB(indic)",
+    name="MTEB(Indic, beta)",
     tasks=get_tasks(
         tasks=[
             # Bitext
@@ -776,7 +829,7 @@ def load_results(
 
 
 MTEB_EU = Benchmark(
-    name="MTEB(Europe)",
+    name="MTEB(Europe, beta)",
     tasks=get_tasks(
         tasks=[
             "BornholmBitextMining",
```
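Since both English suites now coexist, one way to see what the beta benchmark trims is to diff their task lists. A sketch, assuming each task exposes its name via `task.metadata.name` as elsewhere in mteb:

```python
import mteb

beta = mteb.get_benchmark("MTEB(eng, beta)")
classic = mteb.get_benchmark("MTEB(eng, classic)")

# Benchmark.tasks is the plain task list built by get_tasks() above.
beta_names = {task.metadata.name for task in beta.tasks}
classic_names = {task.metadata.name for task in classic.tasks}

# Tasks present in the classic suite but dropped from the beta revision.
print(sorted(classic_names - beta_names))
```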
2 changes: 1 addition & 1 deletion mteb/leaderboard/app.py
```diff
@@ -27,7 +27,7 @@ def load_results():
 
 benchmarks = mteb.get_benchmarks()
 
-default_benchmark = mteb.get_benchmark("MTEB(Multilingual)")
+default_benchmark = mteb.get_benchmark("MTEB(Multilingual, beta)")
 default_results = default_benchmark.load_results(base_results=all_results)
 
 benchmark_select = gr.Dropdown(
```
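The flow used by the leaderboard app can also be reproduced standalone. A sketch, assuming `mteb.load_results()` is the top-level loader backing `all_results` here (the loader itself is outside this diff); `Benchmark.load_results(base_results=...)` is defined in `mteb/benchmarks/benchmarks.py` above and simply selects the benchmark's tasks from the full result set.

```python
import mteb

# Assumption: top-level loader that returns the full result collection.
all_results = mteb.load_results()

benchmark = mteb.get_benchmark("MTEB(Multilingual, beta)")
# Narrow the full result set to this benchmark's tasks.
benchmark_results = benchmark.load_results(base_results=all_results)
```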
5 changes: 4 additions & 1 deletion scripts/task_selection/task_selection_eng_lite.ipynb
```diff
@@ -70,7 +70,10 @@
 }
 ],
 "source": [
-"from mteb.benchmarks import MTEB_MAIN_EN\n",
+"import mteb\n",
+"\n",
+"MTEB_MAIN_EN = mteb.get_benchmark(\"MTEB(eng, classic)\")\n",
+"\n",
 "\n",
 "tasks = MTEB_MAIN_EN.tasks\n",
 "\n",
```
4 changes: 3 additions & 1 deletion scripts/task_selection/task_selection_eu.ipynb
```diff
@@ -5590,7 +5590,9 @@
 }
 ],
 "source": [
-"from mteb.benchmarks import MTEB_MAIN_EN\n",
+"import mteb\n",
+"\n",
+"MTEB_MAIN_EN = mteb.get_benchmark(\"MTEB(eng, classic)\")\n",
 "\n",
 "exceptions = [\n",
 "    \"STS22.v2\",\n",
```
5 changes: 4 additions & 1 deletion scripts/task_selection/task_selection_mult.ipynb
```diff
@@ -1083,7 +1083,10 @@
 }
 ],
 "source": [
-"from mteb.benchmarks import MTEB_MAIN_EN\n",
+"import mteb\n",
+"\n",
+"MTEB_MAIN_EN = mteb.get_benchmark(\"MTEB(eng, classic)\")\n",
+"\n",
 "\n",
 "exceptions = [\n",
 "    \"STS16\",\n",
```
4 changes: 3 additions & 1 deletion tests/test_benchmark/test_benchmark.py
```diff
@@ -201,7 +201,9 @@ def test_benchmark_names_must_be_unique():
     assert len(names) == len(set(names))
 
 
-@pytest.mark.parametrize("name", ["MTEB(eng)", "MTEB(rus)", "MTEB(Scandinavian)"])
+@pytest.mark.parametrize(
+    "name", ["MTEB(eng, classic)", "MTEB(rus)", "MTEB(Scandinavian)"]
+)
 def test_get_benchmark(name):
     benchmark = mteb.get_benchmark(benchmark_name=name)
     assert isinstance(benchmark, mteb.Benchmark)
```
4 changes: 2 additions & 2 deletions tests/test_cli.py
```diff
@@ -27,8 +27,8 @@ def test_available_benchmarks():
     result = subprocess.run(command, shell=True, capture_output=True, text=True)
     assert result.returncode == 0, "Command failed"
     assert (
-        "MTEB(eng)" in result.stdout
-    ), "Sample benchmark MTEB(eng) task not found in available benchmarks"
+        "MTEB(eng, classic)" in result.stdout
+    ), "Sample benchmark MTEB(eng, classic) task not found in available benchmarks"
 
 
 run_task_fixures = [
```
6 changes: 5 additions & 1 deletion tests/test_tasks/test_retrieval_abstask.py
```diff
@@ -1,10 +1,14 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 import pytest
 
-from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval
 from mteb.tasks.Retrieval.eng.NFCorpusRetrieval import NFCorpus
 
+if TYPE_CHECKING:
+    from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval
+
 
 @pytest.mark.parametrize("task", [NFCorpus()])
 def test_abstask_calculate_metadata_metrics(task: AbsTaskRetrieval):
```
