diff --git a/mteb/abstasks/AbsTaskRetrieval.py b/mteb/abstasks/AbsTaskRetrieval.py index 35eabc1fc..078979b6b 100644 --- a/mteb/abstasks/AbsTaskRetrieval.py +++ b/mteb/abstasks/AbsTaskRetrieval.py @@ -34,6 +34,7 @@ def __init__( qrels_file: str = "", streaming: bool = False, keep_in_memory: bool = False, + trust_remote_code: bool = False, ): self.corpus = {} self.queries = {} @@ -63,6 +64,7 @@ def __init__( self.qrels_file = qrels_file self.streaming = streaming self.keep_in_memory = keep_in_memory + self.trust_remote_code = trust_remote_code @staticmethod def check(fIn: str, ext: str): @@ -125,6 +127,7 @@ def _load_corpus(self): "corpus", keep_in_memory=self.keep_in_memory, streaming=self.streaming, + trust_remote_code=self.trust_remote_code, ) else: corpus_ds = load_dataset( @@ -152,6 +155,7 @@ def _load_queries(self): "queries", keep_in_memory=self.keep_in_memory, streaming=self.streaming, + trust_remote_code=self.trust_remote_code, ) else: queries_ds = load_dataset( @@ -174,6 +178,7 @@ def _load_qrels(self, split): self.hf_repo_qrels, keep_in_memory=self.keep_in_memory, streaming=self.streaming, + trust_remote_code=self.trust_remote_code, )[split] else: qrels_ds = load_dataset( @@ -254,6 +259,9 @@ def load_data(self, **kwargs): hf_repo_qrels=hf_repo_qrels, streaming=False, keep_in_memory=False, + trust_remote_code=self.metadata_dict["dataset"].get( + "trust_remote_code", False + ), ).load(split=split) # Conversion from DataSet queries = {query["id"]: query["text"] for query in queries} diff --git a/mteb/tasks/PairClassification/multilingual/XStance.py b/mteb/tasks/PairClassification/multilingual/XStance.py index 09c787fb9..515e59894 100644 --- a/mteb/tasks/PairClassification/multilingual/XStance.py +++ b/mteb/tasks/PairClassification/multilingual/XStance.py @@ -58,7 +58,7 @@ def load_data(self, **kwargs): self.dataset = {} path = self.metadata_dict["dataset"]["path"] revision = self.metadata_dict["dataset"]["revision"] - raw_dataset = load_dataset(path, revision=revision) + raw_dataset = load_dataset(path, revision=revision, trust_remote_code=True) def convert_example(example): return { diff --git a/mteb/tasks/Retrieval/multilingual/MultiLongDocRetrieval.py b/mteb/tasks/Retrieval/multilingual/MultiLongDocRetrieval.py index ded855889..025a34ef6 100644 --- a/mteb/tasks/Retrieval/multilingual/MultiLongDocRetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/MultiLongDocRetrieval.py @@ -38,7 +38,11 @@ def load_mldr_data( for lang in langs: lang_corpus = datasets.load_dataset( - path, f"corpus-{lang}", cache_dir=cache_dir, revision=revision + path, + f"corpus-{lang}", + cache_dir=cache_dir, + revision=revision, + trust_remote_code=True, )["corpus"] lang_corpus = {e["docid"]: {"text": e["text"]} for e in lang_corpus} lang_data = datasets.load_dataset(path, lang, cache_dir=cache_dir) @@ -65,7 +69,6 @@ class MultiLongDocRetrieval(MultilingualTask, AbsTaskRetrieval): dataset={ "path": "Shitao/MLDR", "revision": "d67138e705d963e346253a80e59676ddb418810a", - "trust_remote_code": True, }, type="Retrieval", category="s2p", diff --git a/mteb/tasks/Retrieval/multilingual/XMarketRetrieval.py b/mteb/tasks/Retrieval/multilingual/XMarketRetrieval.py index d81722e04..01d240eb9 100644 --- a/mteb/tasks/Retrieval/multilingual/XMarketRetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/XMarketRetrieval.py @@ -30,6 +30,7 @@ def _load_xmarket_data( languages=[lang], split=split, cache_dir=cache_dir, + trust_remote_code=True, ) query_rows = datasets.load_dataset( path, @@ -38,6 +39,7 @@ def _load_xmarket_data( revision=revision, split=split, cache_dir=cache_dir, + trust_remote_code=True, ) qrels_rows = datasets.load_dataset( path, @@ -46,6 +48,7 @@ def _load_xmarket_data( revision=revision, split=split, cache_dir=cache_dir, + trust_remote_code=True, ) corpus[lang][split] = {row["_id"]: row for row in corpus_rows} @@ -69,7 +72,6 @@ class XMarket(MultilingualTask, AbsTaskRetrieval): dataset={ "path": "jinaai/xmarket_ml", "revision": "dfe57acff5b62c23732a7b7d3e3fb84ff501708b", - "trust_remote_code": True, }, type="Retrieval", category="s2p",