diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py
index 045bdf4ca..9b89d5dd4 100644
--- a/mteb/leaderboard/app.py
+++ b/mteb/leaderboard/app.py
@@ -97,28 +97,28 @@ def update_task_info(task_names: str) -> str:
             info="Select one of our expert-selected benchmarks from MTEB publications.",
         )
         lang_select = gr.Dropdown(
-            default_results.languages,
+            all_results.languages,
             value=default_results.languages,
             multiselect=True,
             label="Language",
             info="Select languages to include.",
         )
         type_select = gr.Dropdown(
-            default_results.task_types,
+            all_results.task_types,
             value=default_results.task_types,
             multiselect=True,
             label="Task Type",
             info="Select task types to include.",
         )
         domain_select = gr.Dropdown(
-            default_results.domains,
+            all_results.domains,
             value=default_results.domains,
             multiselect=True,
             label="Domain",
             info="Select domains to include.",
         )
         task_select = gr.Dropdown(
-            default_results.task_names,
+            all_results.task_names,
             value=default_results.task_names,
             allow_custom_value=True,
             multiselect=True,
@@ -191,7 +191,7 @@ def update_task_info(task_names: str) -> str:
             [
                 (
                     "Should be sentence-transformers compatible",
-                    "sbert_compatible",
+                    "Sentence Transformers",
                 )
             ],
             value=[],
@@ -239,10 +239,13 @@ def update_tables(scores, search_query: str):
     def on_select_benchmark(benchmark_name):
         benchmark = mteb.get_benchmark(benchmark_name)
         benchmark_results = benchmark.load_results(base_results=all_results)
+        task_types = benchmark_results.task_types
+        langs = benchmark_results.languages
+        domains = benchmark_results.domains
         return (
-            benchmark_results.languages,
-            benchmark_results.task_types,
-            benchmark_results.domains,
+            langs,
+            task_types,
+            domains,
         )
 
     @gr.on(
diff --git a/mteb/leaderboard/figures.py b/mteb/leaderboard/figures.py
index 2810eb0a3..e8419d9a3 100644
--- a/mteb/leaderboard/figures.py
+++ b/mteb/leaderboard/figures.py
@@ -16,6 +16,13 @@ def parse_model_name(name: str) -> str:
     return name[1:]
 
 
+def parse_float(value) -> float:
+    try:
+        return float(value)
+    except ValueError:
+        return np.nan
+
+
 models_to_annotate = [
     "all-MiniLM-L6-v2",
     "GritLM-7B",
@@ -32,6 +39,10 @@ def performance_size_plot(df: pd.DataFrame) -> go.Figure:
     df["Embedding Dimensions"] = df["Embedding Dimensions"].map(int)
     df["Max Tokens"] = df["Max Tokens"].map(int)
     df["Log(Tokens)"] = np.log10(df["Max Tokens"])
+    df["Mean (Task)"] = df["Mean (Task)"].map(parse_float)
+    df = df.dropna(subset=["Mean (Task)", "Number of Parameters"])
+    if not len(df.index):
+        return go.Figure()
     min_score, max_score = df["Mean (Task)"].min(), df["Mean (Task)"].max()
     fig = px.scatter(
         df,
diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py
index bc8103077..034b33b4f 100644
--- a/mteb/leaderboard/table.py
+++ b/mteb/leaderboard/table.py
@@ -2,6 +2,7 @@
 
 import math
 import re
+from collections import defaultdict
 
 import gradio as gr
 import numpy as np
@@ -26,17 +27,21 @@ def get_borda_rank(score_table: pd.DataFrame) -> pd.Series:
 
 
 def format_scores(score: float) -> float:
-    return score * 100
+    return round(score * 100, 2)
 
 
 def format_n_parameters(n_parameters) -> str:
-    if n_parameters is None:
+    if (n_parameters is None) or (not int(n_parameters)):
         return ""
-    n_million = int(n_parameters) // 1e6
-    n_zeros = math.log10(n_million)
+    n_thousand = int(n_parameters // 1e3)
+    if n_thousand < 1:
+        return str(int(n_parameters))
+    n_zeros = math.log10(n_thousand)
+    if n_zeros >= 6:
+        return str(n_thousand // (10**6)) + "B"
     if n_zeros >= 3:
-        return str(n_million // (10**3)) + "B"
-    return str(n_million) + "M"
+        return str(n_thousand // (10**3)) + "M"
+    return str(n_thousand) + "K"
 
 
 def split_on_capital(s: str) -> str:
@@ -70,6 +75,29 @@ def get_column_types(df: pd.DataFrame) -> list[str]:
     return types
 
 
+def get_means_per_types(df: pd.DataFrame) -> pd.DataFrame:
+    task_names_per_type = defaultdict(list)
+    for task_name, task_type in zip(df["task_name"], df["task_type"]):
+        task_names_per_type[task_type].append(task_name)
+    groups = df.groupby(["model_name", "model_revision"])
+    records = []
+    for (model_name, model_revision), group_data in groups:
+        name_to_score = dict(zip(group_data["task_name"], group_data["score"]))
+        for task_type, task_names in task_names_per_type.items():
+            type_mean = np.mean(
+                [name_to_score.get(task_name, np.nan) for task_name in task_names]
+            )
+            records.append(
+                dict(
+                    model_name=model_name,
+                    model_revision=model_revision,
+                    task_type=task_type,
+                    score=type_mean,
+                )
+            )
+    return pd.DataFrame.from_records(records)
+
+
 def scores_to_tables(
     scores_long: list[dict], search_query: str | None = None
 ) -> tuple[gr.DataFrame, gr.DataFrame]:
@@ -79,16 +107,7 @@ def scores_to_tables(
     data["task_type"] = data["task_name"].map(
         lambda task_name: get_task(task_name).metadata.type
     )
-    mean_per_type = (
-        data.groupby(["model_name", "model_revision", "task_type"])[["score"]]
-        .agg(np.nanmean)
-        .reset_index()
-    )
-    typed_mean = (
-        mean_per_type.groupby(["model_name", "model_revision"])[["score"]]
-        .agg(np.nanmean)
-        .rename(columns={"score": "mean_by_task_type"})
-    )
+    mean_per_type = get_means_per_types(data)
     mean_per_type = mean_per_type.pivot(
         index=["model_name", "model_revision"], columns="task_type", values="score"
     )
@@ -98,20 +117,18 @@ def scores_to_tables(
     per_task = data.pivot(
         index=["model_name", "model_revision"], columns="task_name", values="score"
     )
-    to_remove = per_task.isna().any(axis="columns")
+    to_remove = per_task.isna().all(axis="columns")
     if search_query:
         names = per_task.index.get_level_values("model_name")
         names = pd.Series(names, index=per_task.index)
         to_remove |= ~names.str.contains(search_query, regex=True)
-    overall_mean = (
-        data.groupby(["model_name", "model_revision"])[["score"]]
-        .agg(np.nanmean)
-        .rename(columns={"score": "mean"})
-    )
+    typed_mean = mean_per_type.mean(skipna=False, axis=1)
+    overall_mean = per_task.mean(skipna=False, axis=1)
+    joint_table = mean_per_type.copy()
     per_task = per_task[~to_remove]
-    mean_per_type = mean_per_type[~to_remove]
-    overall_mean = overall_mean[~to_remove]
-    joint_table = overall_mean.join([typed_mean, mean_per_type])
+    joint_table = joint_table[~to_remove]
+    joint_table.insert(0, "mean", overall_mean)
+    joint_table.insert(1, "mean_by_task_type", typed_mean)
     joint_table["borda_rank"] = get_borda_rank(per_task)
     joint_table = joint_table.reset_index()
     joint_table = joint_table.drop(columns=["model_revision"])
@@ -132,7 +149,7 @@ def scores_to_tables(
         "Number of Parameters",
         model_metas.map(lambda m: format_n_parameters(m.n_parameters)),
     )
-    joint_table = joint_table.sort_values("mean", ascending=False)
+    joint_table = joint_table.sort_values("borda_rank", ascending=True)
     # Removing HF organization from model
     joint_table["model_name"] = joint_table["model_name"].map(
         lambda name: name.split("/")[-1]
@@ -165,7 +182,7 @@ def scores_to_tables(
     # setting model name column to markdown
     column_types[1] = "markdown"
     score_columns = ["Mean (Task)", "Mean (TaskType)", *mean_per_type.columns]
-    joint_table[score_columns] *= 100
+    joint_table[score_columns] = joint_table[score_columns].map(format_scores)
     joint_table_style = (
         joint_table.style.format(
             {**{column: "{:.2f}" for column in score_columns}, "Rank (Borda)": "{:.0f}"}
diff --git a/mteb/models/jina_models.py b/mteb/models/jina_models.py
index b103e174e..08eb6cb63 100644
--- a/mteb/models/jina_models.py
+++ b/mteb/models/jina_models.py
@@ -214,7 +214,7 @@ def encode(
     open_weights=True,
     revision="215a6e121fa0183376388ac6b1ae230326bfeaed",
     release_date="2024-09-18",  # official release date
-    n_parameters=572_000,
+    n_parameters=572 * 1e6,
     max_tokens=8194,
     embed_dim=4096,
     license="cc-by-nc-4.0",
diff --git a/mteb/models/overview.py b/mteb/models/overview.py
index 1db236969..91b84e38d 100644
--- a/mteb/models/overview.py
+++ b/mteb/models/overview.py
@@ -96,7 +96,7 @@ def get_model_metas(
         if (frameworks is not None) and not (frameworks <= set(model_meta.framework)):
             continue
         if (use_instructions is not None) and (
-            model_meta.use_instuctions != use_instructions
+            model_meta.use_instructions != use_instructions
         ):
             continue
         lower, upper = n_parameters_range