diff --git a/mteb/descriptive_stats/Retrieval/AutoRAGRetrieval.json b/mteb/descriptive_stats/Retrieval/AutoRAGRetrieval.json
new file mode 100644
index 000000000..62c9124a4
--- /dev/null
+++ b/mteb/descriptive_stats/Retrieval/AutoRAGRetrieval.json
@@ -0,0 +1,11 @@
+{
+    "test": {
+        "number_of_characters": 894.2168128654971,
+        "num_samples": 834,
+        "num_queries": 114,
+        "num_documents": 720,
+        "average_document_length": 1.1452816358024691,
+        "average_query_length": 0.610649430594029,
+        "average_relevant_docs_per_query": 1.0
+    }
+}
\ No newline at end of file
diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py
index c29a4383c..f8a47b08a 100644
--- a/mteb/tasks/Retrieval/__init__.py
+++ b/mteb/tasks/Retrieval/__init__.py
@@ -100,6 +100,7 @@
 from .jpn.NLPJournalTitleAbsRetrieval import *
 from .jpn.NLPJournalTitleIntroRetrieval import *
 from .kat.GeorgianFAQRetrieval import *
+from .kor.AutoRAGRetrieval import *
 from .kor.KoStrategyQA import *
 from .multilingual.BelebeleRetrieval import *
 from .multilingual.CrossLingualSemanticDiscriminationWMT19 import *
diff --git a/mteb/tasks/Retrieval/kor/AutoRAGRetrieval.py b/mteb/tasks/Retrieval/kor/AutoRAGRetrieval.py
new file mode 100644
index 000000000..4a24e04e9
--- /dev/null
+++ b/mteb/tasks/Retrieval/kor/AutoRAGRetrieval.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval
+
+
+class AutoRAGRetrieval(AbsTaskRetrieval):
+    """Retrieval task over Korean text, backed by ``yjoonjang/markers_bm``.
+
+    Only ``metadata`` is declared here; no ``AbsTaskRetrieval`` methods are
+    overridden.
+    """
+
+    metadata = TaskMetadata(
+        name="AutoRAGRetrieval",
+        description="This dataset enables the evaluation of Korean RAG performance across various domains—finance, public sector, healthcare, legal, and commerce—by providing publicly accessible documents, questions, and answers.",
+        reference="https://arxiv.org/abs/2410.20878",
+        dataset={
+            "path": "yjoonjang/markers_bm",
+            "revision": "fd7df84ac089bbec763b1c6bb1b56e985df5cc5c",
+        },
+        type="Retrieval",
+        prompt="Retrieve text based on user query.",
+        category="s2p",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["kor-Hang"],
+        main_score="ndcg_at_10",
+        date=("2024-08-03", "2024-08-03"),
+        # NOTE(review): the description also lists finance and commerce, which
+        # have no counterpart in this list -- confirm the omission is intended.
+        domains=["Government", "Medical", "Legal", "Social"],
+        task_subtypes=["Article retrieval"],
+        license="mit",
+        annotations_creators="human-annotated",
+        dialect=[],
+        sample_creation="created",
+        bibtex_citation="""@misc{kim2024autoragautomatedframeworkoptimization,
+      title={AutoRAG: Automated Framework for optimization of Retrieval Augmented Generation Pipeline},
+      author={Dongkyu Kim and Byoungwook Kim and Donggeon Han and Matouš Eibich},
+      year={2024},
+      eprint={2410.20878},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2410.20878},
+}""",
+    )