From f6318943dfbebf59ae5fcab6ac24ca4919993697 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michael=20G=C3=BCnther?=
Date: Wed, 25 Sep 2024 13:33:32 +0200
Subject: [PATCH] Feat more arguments (#13)

* feat: add semantic chunking to eval script; add wrapper for minilm
* fix: gaps in semantic chunking
* feat: add option to pass custom model for chunking
* feat: support nomic ai model
* feat: add additional cmd args
* feat: add arg for n_sentences
---
 chunked_pooling/mteb_chunked_eval.py | 19 +++++++++++++++
 run_chunked_eval.py                  | 35 +++++++++++++++++++++++++---
 2 files changed, 51 insertions(+), 3 deletions(-)

diff --git a/chunked_pooling/mteb_chunked_eval.py b/chunked_pooling/mteb_chunked_eval.py
index 2433e7f..b119deb 100644
--- a/chunked_pooling/mteb_chunked_eval.py
+++ b/chunked_pooling/mteb_chunked_eval.py
@@ -26,6 +26,7 @@ def __init__(
         n_sentences: Optional[int] = None,
         model_has_instructions: bool = False,
         embedding_model_name: Optional[str] = None,  # for semantic chunking
+        truncate_max_length: Optional[int] = 8192,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -48,6 +49,7 @@ def __init__(
             'n_sentences': n_sentences,
             'embedding_model_name': embedding_model_name,
         }
+        self.truncate_max_length = truncate_max_length
 
     def load_data(self, **kwargs):
         self.retrieval_task.load_data(**kwargs)
@@ -97,6 +99,21 @@ def evaluate(
 
         return scores
 
+    def _truncate_documents(self, corpus):
+        for k, v in corpus.items():
+            if 'title' in v:
+                raise NotImplementedError(
+                    'Currently truncation is only implemented for documents without titles'
+                )
+            tokens = self.tokenizer(
+                v['text'],
+                return_offsets_mapping=True,
+                max_length=self.truncate_max_length,
+            )
+            last_token_span = tokens.offset_mapping[-2]
+            v['text'] = v['text'][: last_token_span[1]]
+        return corpus
+
     def _evaluate_monolingual(
         self,
         model,
@@ -108,6 +125,8 @@ def _evaluate_monolingual(
         encode_kwargs=None,
         **kwargs,
     ):
+        if self.truncate_max_length:
+            corpus = self._truncate_documents(corpus)
         # split corpus into chunks
         if not self.chunked_pooling_enabled:
             corpus = self._apply_chunking(corpus, self.tokenizer)
diff --git a/run_chunked_eval.py b/run_chunked_eval.py
index ff49da0..88494bd 100644
--- a/run_chunked_eval.py
+++ b/run_chunked_eval.py
@@ -35,7 +35,34 @@
     required=False,
     help='The name of the model used for semantic chunking.',
 )
-def main(model_name, strategy, task_name, eval_split, chunking_model):
+@click.option(
+    '--truncate-max-length',
+    default=None,
+    type=int,
+    help='Maximum number of tokens; By default, no truncation is done.',
+)
+@click.option(
+    '--chunk-size',
+    default=DEFAULT_CHUNK_SIZE,
+    type=int,
+    help='Number of tokens per chunk for fixed strategy.',
+)
+@click.option(
+    '--n-sentences',
+    default=DEFAULT_N_SENTENCES,
+    type=int,
+    help='Number of sentences per chunk for sentence strategy.',
+)
+def main(
+    model_name,
+    strategy,
+    task_name,
+    eval_split,
+    chunking_model,
+    truncate_max_length,
+    chunk_size,
+    n_sentences,
+):
     try:
         task_cls = globals()[task_name]
     except:
@@ -46,8 +73,8 @@ def main(model_name, strategy, task_name, eval_split, chunking_model):
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
     chunking_args = {
-        'chunk_size': DEFAULT_CHUNK_SIZE,
-        'n_sentences': DEFAULT_N_SENTENCES,
+        'chunk_size': chunk_size,
+        'n_sentences': n_sentences,
         'chunking_strategy': strategy,
         'model_has_instructions': has_instructions,
         'embedding_model_name': chunking_model if chunking_model else model_name,
@@ -64,6 +91,7 @@ def main(model_name, strategy, task_name, eval_split, chunking_model):
             chunked_pooling_enabled=True,
             tokenizer=tokenizer,
             prune_size=None,
+            truncate_max_length=truncate_max_length,
            **chunking_args,
         )
     ]
@@ -90,6 +118,7 @@ def main(model_name, strategy, task_name, eval_split, chunking_model):
             chunked_pooling_enabled=False,
             tokenizer=tokenizer,
             prune_size=None,
+            truncate_max_length=truncate_max_length,
             **chunking_args,
         )
     ]
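
Note on the truncation added above: _truncate_documents uses the fast
tokenizer's offset mapping to cut each document at a character position that
coincides with a token boundary, so the kept prefix tokenizes to at most
truncate_max_length tokens. A minimal standalone sketch of the same idea,
assuming any Hugging Face fast tokenizer (the checkpoint name and max_length
value are placeholders; truncation=True is passed explicitly here rather than
relying on implicit defaults):

    from transformers import AutoTokenizer

    # Offset mappings are only available on *fast* tokenizers; the
    # checkpoint below is just an illustrative placeholder.
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    text = 'A long document that should be cut at a token boundary ...'
    tokens = tokenizer(
        text,
        return_offsets_mapping=True,
        truncation=True,
        max_length=32,
    )

    # The last offset-mapping entry belongs to the closing special token
    # ([SEP]) and is (0, 0), so the last real token sits at index -2; its
    # end offset is the character position at which to cut the raw text.
    last_token_span = tokens.offset_mapping[-2]
    truncated_text = text[: last_token_span[1]]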
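With the new options wired through, a typical invocation could look like the
following (the --model-name and --strategy spellings are inferred from the
function signature, since their click declarations sit outside this hunk, and
the model name is only a placeholder):

    python run_chunked_eval.py \
        --model-name jinaai/jina-embeddings-v2-small-en \
        --strategy fixed \
        --chunk-size 256 \
        --truncate-max-length 8192

Because --truncate-max-length defaults to None, documents are left untouched
unless the flag is given, while --chunk-size and --n-sentences fall back to
DEFAULT_CHUNK_SIZE and DEFAULT_N_SENTENCES, preserving the previously
hard-coded behavior.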