Leaderboard: Fixed code benchmarks (#1441)
* fixed code benchmarks

* fix: Made n_parameters formatting smarter and more robust (see the sketch after this list)

* fix: changed jina-embeddings-v3 number of parameters from 572K to 572M

* fix: Fixed use_instuctions typo in model overview

* fix: Fixed sentence-transformer compatibility switch

* Ran linting

* Added all languages, tasks, types and domains to options

* Removed resetting options when a new benchmark is selected

* All results now get displayed, but models that haven't been run on everything get nan values in the table
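
To illustrate the n_parameters formatting change, here is a minimal sketch that copies the updated `format_n_parameters` helper from the `mteb/leaderboard/table.py` diff below; the sample parameter counts are made up:

```python
import math


def format_n_parameters(n_parameters) -> str:
    # Updated helper from mteb/leaderboard/table.py (see the diff below).
    if (n_parameters is None) or (not int(n_parameters)):
        return ""
    n_thousand = int(n_parameters // 1e3)
    if n_thousand < 1:
        return str(int(n_parameters))
    n_zeros = math.log10(n_thousand)
    if n_zeros >= 6:
        return str(n_thousand // (10**6)) + "B"
    if n_zeros >= 3:
        return str(n_thousand // (10**3)) + "M"
    return str(n_thousand) + "K"


# Sample values (made up) showing the intended behaviour:
assert format_n_parameters(None) == ""            # missing metadata
assert format_n_parameters(500) == "500"          # below 1K, printed as-is
assert format_n_parameters(33_000) == "33K"
assert format_n_parameters(572 * 1e6) == "572M"   # e.g. jina-embeddings-v3
assert format_n_parameters(7 * 1e9) == "7B"
```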
x-tabdeveloping authored Nov 13, 2024
Parent: 76c2112 · Commit: 3a1a470
Showing 5 changed files with 68 additions and 37 deletions.
mteb/leaderboard/app.py: 19 changes (11 additions & 8 deletions)
@@ -97,28 +97,28 @@ def update_task_info(task_names: str) -> str:
             info="Select one of our expert-selected benchmarks from MTEB publications.",
         )
         lang_select = gr.Dropdown(
-            default_results.languages,
+            all_results.languages,
             value=default_results.languages,
             multiselect=True,
             label="Language",
             info="Select languages to include.",
         )
         type_select = gr.Dropdown(
-            default_results.task_types,
+            all_results.task_types,
             value=default_results.task_types,
             multiselect=True,
             label="Task Type",
             info="Select task types to include.",
         )
         domain_select = gr.Dropdown(
-            default_results.domains,
+            all_results.domains,
             value=default_results.domains,
             multiselect=True,
             label="Domain",
             info="Select domains to include.",
         )
         task_select = gr.Dropdown(
-            default_results.task_names,
+            all_results.task_names,
             value=default_results.task_names,
             allow_custom_value=True,
             multiselect=True,
@@ -191,7 +191,7 @@ def update_task_info(task_names: str) -> str:
             [
                 (
                     "Should be sentence-transformers compatible",
-                    "sbert_compatible",
+                    "Sentence Transformers",
                 )
             ],
             value=[],
@@ -239,10 +239,13 @@ def update_tables(scores, search_query: str):
     def on_select_benchmark(benchmark_name):
         benchmark = mteb.get_benchmark(benchmark_name)
         benchmark_results = benchmark.load_results(base_results=all_results)
+        task_types = benchmark_results.task_types
+        langs = benchmark_results.languages
+        domains = benchmark_results.domains
         return (
-            benchmark_results.languages,
-            benchmark_results.task_types,
-            benchmark_results.domains,
+            langs,
+            task_types,
+            domains,
         )

     @gr.on(
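
For context, a minimal Gradio sketch of the pattern the dropdown changes above follow: the choices now come from the full result set while the preselected value stays at the benchmark defaults. The language lists here are hypothetical stand-ins for `all_results` / `default_results`.

```python
import gradio as gr

# Hypothetical option lists standing in for all_results / default_results:
all_languages = ["eng", "fra", "deu", "zho"]
default_languages = ["eng", "fra"]

with gr.Blocks() as demo:
    # Choices span everything available, the value only covers the defaults,
    # so users can opt into languages outside the selected benchmark.
    lang_select = gr.Dropdown(
        all_languages,
        value=default_languages,
        multiselect=True,
        label="Language",
        info="Select languages to include.",
    )

if __name__ == "__main__":
    demo.launch()
```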
mteb/leaderboard/figures.py: 11 changes (11 additions & 0 deletions)
@@ -16,6 +16,13 @@ def parse_model_name(name: str) -> str:
     return name[1:]


+def parse_float(value) -> float:
+    try:
+        return float(value)
+    except ValueError:
+        return np.nan
+
+
 models_to_annotate = [
     "all-MiniLM-L6-v2",
     "GritLM-7B",
@@ -32,6 +39,10 @@ def performance_size_plot(df: pd.DataFrame) -> go.Figure:
     df["Embedding Dimensions"] = df["Embedding Dimensions"].map(int)
     df["Max Tokens"] = df["Max Tokens"].map(int)
     df["Log(Tokens)"] = np.log10(df["Max Tokens"])
+    df["Mean (Task)"] = df["Mean (Task)"].map(parse_float)
+    df = df.dropna(subset=["Mean (Task)", "Number of Parameters"])
+    if not len(df.index):
+        return go.Figure()
     min_score, max_score = df["Mean (Task)"].min(), df["Mean (Task)"].max()
     fig = px.scatter(
         df,
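
A small sketch (with made-up values) of why the plot code above now parses and drops non-numeric means: blank or malformed "Mean (Task)" cells become NaN and the affected rows are excluded before plotting instead of raising.

```python
import numpy as np
import pandas as pd


def parse_float(value) -> float:
    # Same helper as added in mteb/leaderboard/figures.py above.
    try:
        return float(value)
    except ValueError:
        return np.nan


df = pd.DataFrame({"Mean (Task)": ["61.2", "", "58.9"]})  # made-up scores
df["Mean (Task)"] = df["Mean (Task)"].map(parse_float)
df = df.dropna(subset=["Mean (Task)"])  # the blank entry is dropped
print(df)  # only the two numeric rows remain
```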
mteb/leaderboard/table.py: 71 changes (44 additions & 27 deletions)
@@ -2,6 +2,7 @@

 import math
 import re
+from collections import defaultdict

 import gradio as gr
 import numpy as np
@@ -26,17 +27,21 @@ def get_borda_rank(score_table: pd.DataFrame) -> pd.Series:


 def format_scores(score: float) -> float:
-    return score * 100
+    return round(score * 100, 2)


 def format_n_parameters(n_parameters) -> str:
-    if n_parameters is None:
+    if (n_parameters is None) or (not int(n_parameters)):
         return ""
-    n_million = int(n_parameters) // 1e6
-    n_zeros = math.log10(n_million)
+    n_thousand = int(n_parameters // 1e3)
+    if n_thousand < 1:
+        return str(int(n_parameters))
+    n_zeros = math.log10(n_thousand)
+    if n_zeros >= 6:
+        return str(n_thousand // (10**6)) + "B"
     if n_zeros >= 3:
-        return str(n_million // (10**3)) + "B"
-    return str(n_million) + "M"
+        return str(n_thousand // (10**3)) + "M"
+    return str(n_thousand) + "K"


 def split_on_capital(s: str) -> str:
@@ -70,6 +75,29 @@ def get_column_types(df: pd.DataFrame) -> list[str]:
     return types


+def get_means_per_types(df: pd.DataFrame) -> pd.DataFrame:
+    task_names_per_type = defaultdict(list)
+    for task_name, task_type in zip(df["task_name"], df["task_type"]):
+        task_names_per_type[task_type].append(task_name)
+    groups = df.groupby(["model_name", "model_revision"])
+    records = []
+    for (model_name, model_revision), group_data in groups:
+        name_to_score = dict(zip(group_data["task_name"], group_data["score"]))
+        for task_type, task_names in task_names_per_type.items():
+            type_mean = np.mean(
+                [name_to_score.get(task_name, np.nan) for task_name in task_names]
+            )
+            records.append(
+                dict(
+                    model_name=model_name,
+                    model_revision=model_revision,
+                    task_type=task_type,
+                    score=type_mean,
+                )
+            )
+    return pd.DataFrame.from_records(records)
+
+
 def scores_to_tables(
     scores_long: list[dict], search_query: str | None = None
 ) -> tuple[gr.DataFrame, gr.DataFrame]:
@@ -79,16 +107,7 @@ def scores_to_tables(
     data["task_type"] = data["task_name"].map(
         lambda task_name: get_task(task_name).metadata.type
     )
-    mean_per_type = (
-        data.groupby(["model_name", "model_revision", "task_type"])[["score"]]
-        .agg(np.nanmean)
-        .reset_index()
-    )
-    typed_mean = (
-        mean_per_type.groupby(["model_name", "model_revision"])[["score"]]
-        .agg(np.nanmean)
-        .rename(columns={"score": "mean_by_task_type"})
-    )
+    mean_per_type = get_means_per_types(data)
     mean_per_type = mean_per_type.pivot(
         index=["model_name", "model_revision"], columns="task_type", values="score"
     )
@@ -98,20 +117,18 @@
     per_task = data.pivot(
         index=["model_name", "model_revision"], columns="task_name", values="score"
     )
-    to_remove = per_task.isna().any(axis="columns")
+    to_remove = per_task.isna().all(axis="columns")
     if search_query:
         names = per_task.index.get_level_values("model_name")
         names = pd.Series(names, index=per_task.index)
         to_remove |= ~names.str.contains(search_query, regex=True)
-    overall_mean = (
-        data.groupby(["model_name", "model_revision"])[["score"]]
-        .agg(np.nanmean)
-        .rename(columns={"score": "mean"})
-    )
+    typed_mean = mean_per_type.mean(skipna=False, axis=1)
+    overall_mean = per_task.mean(skipna=False, axis=1)
+    joint_table = mean_per_type.copy()
     per_task = per_task[~to_remove]
-    mean_per_type = mean_per_type[~to_remove]
-    overall_mean = overall_mean[~to_remove]
-    joint_table = overall_mean.join([typed_mean, mean_per_type])
+    joint_table = joint_table[~to_remove]
+    joint_table.insert(0, "mean", overall_mean)
+    joint_table.insert(1, "mean_by_task_type", typed_mean)
     joint_table["borda_rank"] = get_borda_rank(per_task)
     joint_table = joint_table.reset_index()
     joint_table = joint_table.drop(columns=["model_revision"])
@@ -132,7 +149,7 @@
         "Number of Parameters",
         model_metas.map(lambda m: format_n_parameters(m.n_parameters)),
     )
-    joint_table = joint_table.sort_values("mean", ascending=False)
+    joint_table = joint_table.sort_values("borda_rank", ascending=True)
     # Removing HF organization from model
     joint_table["model_name"] = joint_table["model_name"].map(
         lambda name: name.split("/")[-1]
@@ -165,7 +182,7 @@
     # setting model name column to markdown
     column_types[1] = "markdown"
     score_columns = ["Mean (Task)", "Mean (TaskType)", *mean_per_type.columns]
-    joint_table[score_columns] *= 100
+    joint_table[score_columns] = joint_table[score_columns].map(format_scores)
     joint_table_style = (
         joint_table.style.format(
             {**{column: "{:.2f}" for column in score_columns}, "Rank (Borda)": "{:.0f}"}
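
A toy example (hypothetical model and task names) of the new behaviour described in the commit message: with `skipna=False`, a model that is missing any task keeps its row but gets a NaN mean rather than being filtered out; only models with no scores at all are removed.

```python
import pandas as pd

# Toy long-format scores; model_b was never run on TaskB.
data = pd.DataFrame(
    {
        "model_name": ["model_a", "model_a", "model_b"],
        "model_revision": ["rev1", "rev1", "rev1"],
        "task_name": ["TaskA", "TaskB", "TaskA"],
        "score": [0.8, 0.6, 0.9],
    }
)

per_task = data.pivot(
    index=["model_name", "model_revision"], columns="task_name", values="score"
)
# Only rows with no scores at all are removed; partial rows stay.
to_remove = per_task.isna().all(axis="columns")
overall_mean = per_task.mean(skipna=False, axis=1)
print(per_task[~to_remove])
print(overall_mean)  # model_a -> 0.7, model_b -> NaN
```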
mteb/models/jina_models.py: 2 changes (1 addition & 1 deletion)
@@ -214,7 +214,7 @@ def encode(
     open_weights=True,
     revision="215a6e121fa0183376388ac6b1ae230326bfeaed",
     release_date="2024-09-18",  # official release date
-    n_parameters=572_000,
+    n_parameters=572 * 1e6,
     max_tokens=8194,
     embed_dim=4096,
     license="cc-by-nc-4.0",
mteb/models/overview.py: 2 changes (1 addition & 1 deletion)
@@ -96,7 +96,7 @@ def get_model_metas(
         if (frameworks is not None) and not (frameworks <= set(model_meta.framework)):
             continue
         if (use_instructions is not None) and (
-            model_meta.use_instuctions != use_instructions
+            model_meta.use_instructions != use_instructions
         ):
             continue
         lower, upper = n_parameters_range
