Skip to content

Commit

Permalink
Feat more arguments (#13)
Browse files: browse the repository at this point in the history.
* feat: add semantic chunking to eval script; add wrapper for minilm

* fix: gaps in semantic chunking

* feat: add option to pass custom model for chunking

* feat: support nomic ai model

* feat: add additional cmd args

* feat: add arg for n_sentences
  • Loading branch information
guenthermi authored Sep 25, 2024
1 parent 9dee799 commit f631894
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 3 deletions.
19 changes: 19 additions & 0 deletions chunked_pooling/mteb_chunked_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def __init__(
n_sentences: Optional[int] = None,
model_has_instructions: bool = False,
embedding_model_name: Optional[str] = None, # for semantic chunking
truncate_max_length: Optional[int] = 8192,
**kwargs,
):
super().__init__(**kwargs)
Expand All @@ -48,6 +49,7 @@ def __init__(
'n_sentences': n_sentences,
'embedding_model_name': embedding_model_name,
}
self.truncate_max_length = truncate_max_length

def load_data(self, **kwargs):
self.retrieval_task.load_data(**kwargs)
Expand Down Expand Up @@ -97,6 +99,21 @@ def evaluate(

return scores

def _truncate_documents(self, corpus):
for k, v in corpus.items():
if 'title' in v:
raise NotImplementedError(
'Currently truncation is only implemented for documents without titles'
)
tokens = self.tokenizer(
v['text'],
return_offsets_mapping=True,
max_length=self.truncate_max_length,
)
last_token_span = tokens.offset_mapping[-2]
v['text'] = v['text'][: last_token_span[1]]
return corpus

def _evaluate_monolingual(
self,
model,
Expand All @@ -108,6 +125,8 @@ def _evaluate_monolingual(
encode_kwargs=None,
**kwargs,
):
if self.truncate_max_length:
corpus = self._truncate_documents(corpus)
# split corpus into chunks
if not self.chunked_pooling_enabled:
corpus = self._apply_chunking(corpus, self.tokenizer)
Expand Down
35 changes: 32 additions & 3 deletions run_chunked_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,34 @@
required=False,
help='The name of the model used for semantic chunking.',
)
def main(model_name, strategy, task_name, eval_split, chunking_model):
@click.option(
'--truncate-max-length',
default=None,
type=int,
help='Maximum number of tokens; By default, no truncation is done.',
)
@click.option(
'--chunk-size',
default=DEFAULT_CHUNK_SIZE,
type=int,
help='Number of tokens per chunk for fixed strategy.',
)
@click.option(
'--n-sentences',
default=DEFAULT_N_SENTENCES,
type=int,
help='Number of sentences per chunk for sentence strategy.',
)
def main(
model_name,
strategy,
task_name,
eval_split,
chunking_model,
truncate_max_length,
chunk_size,
n_sentences,
):
try:
task_cls = globals()[task_name]
except:
Expand All @@ -46,8 +73,8 @@ def main(model_name, strategy, task_name, eval_split, chunking_model):
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

chunking_args = {
'chunk_size': DEFAULT_CHUNK_SIZE,
'n_sentences': DEFAULT_N_SENTENCES,
'chunk_size': chunk_size,
'n_sentences': n_sentences,
'chunking_strategy': strategy,
'model_has_instructions': has_instructions,
'embedding_model_name': chunking_model if chunking_model else model_name,
Expand All @@ -64,6 +91,7 @@ def main(model_name, strategy, task_name, eval_split, chunking_model):
chunked_pooling_enabled=True,
tokenizer=tokenizer,
prune_size=None,
truncate_max_length=truncate_max_length,
**chunking_args,
)
]
Expand All @@ -90,6 +118,7 @@ def main(model_name, strategy, task_name, eval_split, chunking_model):
chunked_pooling_enabled=False,
tokenizer=tokenizer,
prune_size=None,
truncate_max_length=truncate_max_length,
**chunking_args,
)
]
Expand Down

0 comments on commit f631894

Please sign in to comment.