diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/summarize_outputs.py b/evaluation/benchmarks/swe_bench/scripts/eval/summarize_outputs.py index f08ee73eee8a..f2fc46095db4 100755 --- a/evaluation/benchmarks/swe_bench/scripts/eval/summarize_outputs.py +++ b/evaluation/benchmarks/swe_bench/scripts/eval/summarize_outputs.py @@ -3,11 +3,11 @@ import glob import json import os +import random from collections import Counter -import pandas as pd -import random import numpy as np +import pandas as pd from openhands.events.serialization import event_from_dict from openhands.events.utils import get_pairs_from_events @@ -20,12 +20,11 @@ ] -def get_bootstrap_accuracy_error_bars(values: float | int | bool, num_samples: int = 1000, p_value=0.05) -> tuple[float, float]: +def get_bootstrap_accuracy_error_bars( + values: float | int | bool, num_samples: int = 1000, p_value=0.05 +) -> tuple[float, float]: sorted_vals = np.sort( - [ - np.mean(random.sample(values, len(values) // 2)) - for _ in range(num_samples) - ] + [np.mean(random.sample(values, len(values) // 2)) for _ in range(num_samples)] ) bottom_idx = int(num_samples * p_value / 2) top_idx = int(num_samples * (1.0 - p_value / 2)) @@ -118,7 +117,9 @@ def process_file(file_path): 'resolved': { 'count': num_resolved, 'percentage': (num_resolved / num_lines * 100) if num_lines > 0 else 0, - 'ci': tuple(x * 100 for x in get_bootstrap_accuracy_error_bars(resolved_arr)), + 'ci': tuple( + x * 100 for x in get_bootstrap_accuracy_error_bars(resolved_arr) + ), }, 'empty_patches': { 'count': num_empty_patch, diff --git a/frontend/src/i18n/translation.json b/frontend/src/i18n/translation.json index 670a0e93f547..f7ee8f4959d6 100644 --- a/frontend/src/i18n/translation.json +++ b/frontend/src/i18n/translation.json @@ -2009,7 +2009,7 @@ "en": "Running a bash command" }, "ACTION_MESSAGE$RUN_IPYTHON": { - "en": "Running a Jupyter command" + "en": "Running a Python command" }, "ACTION_MESSAGE$READ": { "en": "Reading the contents of a file" @@ -2027,7 +2027,7 @@ "en": "Ran a bash command" }, "OBSERVATION_MESSAGE$RUN_IPYTHON": { - "en": "Ran a Jupyter command" + "en": "Ran a Python command" }, "OBSERVATION_MESSAGE$READ": { "en": "Read the contents of a file" diff --git a/pyproject.toml b/pyproject.toml index d97b9f8ee318..980a889c7141 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -100,6 +100,7 @@ reportlab = "*" [tool.coverage.run] concurrency = ["gevent"] + [tool.poetry.group.runtime.dependencies] jupyterlab = "*" notebook = "*" @@ -129,6 +130,7 @@ ignore = ["D1"] [tool.ruff.lint.pydocstyle] convention = "google" + [tool.poetry.group.evaluation.dependencies] streamlit = "*" whatthepatch = "*"