From bfeb7c3df4cef3ed526c85ef7708a5112dc5495e Mon Sep 17 00:00:00 2001 From: MahmoudAshraf97 Date: Wed, 13 Nov 2024 14:34:51 +0200 Subject: [PATCH 1/8] initial cleanup --- faster_whisper/transcribe.py | 139 ++++++++++++++++------------------- 1 file changed, 63 insertions(+), 76 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index d3d2bdf7..6f4b67b0 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -68,7 +68,6 @@ def _asdict(self): return asdict(self) -# Added additional parameters for multilingual videos and fixes below @dataclass class TranscriptionOptions: beam_size: int @@ -112,18 +111,7 @@ class TranscriptionInfo: vad_options: VadOptions -# The code below is originally from HF pipeline and is used in whisper-x -# (https://github.com/m-bain/whisperX) and adapted for faster_whisper - - class BatchedInferencePipeline: - """ - Huggingface Pipeline wrapper for WhisperModel. - Copyright (c) 2022, Max Bain - All rights reserved. - Modified by Mobius Labs GmbH - """ - def __init__( self, model, @@ -137,9 +125,9 @@ def __init__( self.preset_language = language self.last_speech_timestamp = 0.0 - def forward(self, features, chunks_metadata, **forward_params): + def forward(self, features, chunks_metadata, options): encoder_output, outputs = self.model.generate_segment_batched( - features, self.tokenizer, forward_params + features, self.tokenizer, options ) segmented_outputs = [] @@ -179,14 +167,14 @@ def forward(self, features, chunks_metadata, **forward_params): for subsegment in subsegments ] ) - if forward_params["word_timestamps"]: + if options.word_timestamps: self.last_speech_timestamp = self.model.add_word_timestamps( segmented_outputs, self.tokenizer, encoder_output, segment_sizes, - forward_params["prepend_punctuations"], - forward_params["append_punctuations"], + options.prepend_punctuations, + options.append_punctuations, self.last_speech_timestamp, ) @@ -229,7 +217,7 @@ def transcribe( max_new_tokens: Optional[int] = None, chunk_length: Optional[int] = None, clip_timestamps: Optional[List[dict]] = None, - batch_size: int = 16, + batch_size: int = 8, hotwords: Optional[str] = None, language_detection_threshold: Optional[float] = 0.5, language_detection_segments: int = 1, @@ -250,22 +238,11 @@ def transcribe( repetition_penalty: Penalty applied to the score of previously generated tokens (set > 1 to penalize). no_repeat_ngram_size: Prevent repetitions of ngrams with this size (set 0 to disable). - temperature: Temperature for sampling. It can be a tuple of temperatures, - which will be successively used upon failures according to either - `compression_ratio_threshold` or `log_prob_threshold`. - compression_ratio_threshold: If the gzip compression ratio is above this value, - treat as failed. - log_prob_threshold: If the average log probability over sampled tokens is - below this value, treat as failed. - log_prob_low_threshold: This parameter alone is sufficient to skip an output text, - whereas log_prob_threshold also looks for appropriate no_speech_threshold value. - This value should be less than log_prob_threshold. - no_speech_threshold: If the no_speech probability is higher than this value AND - the average log probability over sampled tokens is below `log_prob_threshold`, - consider the segment as silent. + temperature: Temperature for sampling. If a list or tuple is passed, + only the first value is used. 
         initial_prompt: Optional text string or iterable of token ids to provide as a
-              prompt for the first window.
-            prefix: Optional text to provide as a prefix for the first window.
+              prompt for each window.
+            prefix: Optional text to provide as a prefix at the beginning of each window.
             suppress_blank: Suppress blank outputs at the beginning of the sampling.
             suppress_tokens: List of token IDs to suppress. -1 will suppress a default set
                 of symbols as defined in `tokenizer.non_speech_tokens()`.
@@ -296,28 +273,34 @@ def transcribe(
                 higher than this value, the language is detected.
             language_detection_segments: Number of segments to consider for the language detection.
-        Static params: (Fixed for batched version)
+        Unused Arguments
             max_initial_timestamp: The initial timestamp cannot be later than this, set at 0.0.
             multilingual: If True, perform transcription on multilingual videos. Set as False.
             output_language: Valid only if multilingual is set to True.
                 Specifies the string representing the output language. One of
                 'en' (English) or 'hybrid' (code-switched transcription). set as None.
+            language_detection_threshold: If the maximum probability of the language tokens is
+                higher than this value, the language is detected.
+            language_detection_segments: Number of segments to consider for the language detection.
+            compression_ratio_threshold: If the gzip compression ratio is above this value,
+                treat as failed.
+            log_prob_threshold: If the average log probability over sampled tokens is
+                below this value, treat as failed.
+            log_prob_low_threshold: This parameter alone is sufficient to skip an output text,
+                whereas log_prob_threshold also looks for appropriate no_speech_threshold value.
+                This value should be less than log_prob_threshold.
+            no_speech_threshold: If the no_speech probability is higher than this value AND
+                the average log probability over sampled tokens is below `log_prob_threshold`,
+                consider the segment as silent.
+            hallucination_silence_threshold: Optional[float]
+                When word_timestamps is True, skip silent periods longer than this threshold
+                (in seconds) when a possible hallucination is detected. set as None.
             condition_on_previous_text: If True, the previous output of the model is provided
                 as a prompt for the next window; disabling may make the text inconsistent across
                 windows, but the model becomes less prone to getting stuck in a failure loop,
                 such as repetition looping or timestamps going out of sync. Set as False
             prompt_reset_on_temperature: Resets prompt if temperature is above this value.
                 Arg has effect only if condition_on_previous_text is True. Set at 0.5
-        #TODO: support "hallucination_silence_threshold" when "word_timestamps=True"
-        hallucination_silence_threshold: Optional[float]
-            When word_timestamps is True, skip silent periods longer than this threshold
-            (in seconds) when a possible hallucination is detected. set as None.
-
-        unused:
-            language_detection_threshold: If the maximum probability of the language tokens is
-                higher than this value, the language is detected.
-            language_detection_segments: Number of segments to consider for the language detection.
- Returns: A tuple with: @@ -421,8 +404,7 @@ def transcribe( np.stack([pad_or_trim(feature) for feature in features]) if features else [] ) - # batched options: see the difference with default options in WhisperModel - batched_options = TranscriptionOptions( + options = TranscriptionOptions( beam_size=beam_size, best_of=best_of, patience=patience, @@ -434,7 +416,9 @@ def transcribe( no_speech_threshold=no_speech_threshold, compression_ratio_threshold=compression_ratio_threshold, temperatures=( - temperature if isinstance(temperature, (list, tuple)) else [temperature] + temperature[:1] + if isinstance(temperature, (list, tuple)) + else [temperature] ), initial_prompt=initial_prompt, prefix=prefix, @@ -447,7 +431,7 @@ def transcribe( word_timestamps=word_timestamps, hallucination_silence_threshold=None, condition_on_previous_text=False, - clip_timestamps="0", + clip_timestamps=clip_timestamps, prompt_reset_on_temperature=0.5, multilingual=False, output_language=None, @@ -460,7 +444,7 @@ def transcribe( language_probability=language_probability, duration=duration, duration_after_vad=duration_after_vad, - transcription_options=batched_options, + transcription_options=options, vad_options=None, all_language_probs=all_language_probs, ) @@ -469,7 +453,7 @@ def transcribe( features, chunks_metadata, batch_size, - batched_options, + options, log_progress, ) @@ -484,7 +468,7 @@ def _batched_segments_generator( results = self.forward( features[i : i + batch_size], chunks_metadata[i : i + batch_size], - **asdict(options), + options, ) for result in results: @@ -1693,50 +1677,53 @@ def generate_segment_batched( self, features: np.ndarray, tokenizer: Tokenizer, - options: dict, + options: TranscriptionOptions, ): batch_size = features.shape[0] - all_tokens = [] - prompt_reset_since = 0 - if options["initial_prompt"] is not None: - initial_prompt = " " + options["initial_prompt"].strip() - initial_prompt_tokens = tokenizer.encode(initial_prompt) - all_tokens.extend(initial_prompt_tokens) - previous_tokens = all_tokens[prompt_reset_since:] prompt = self.get_prompt( tokenizer, - previous_tokens, - without_timestamps=options["without_timestamps"], - prefix=options["prefix"], + previous_tokens=( + tokenizer.encode(options.initial_prompt) + if options.initial_prompt is not None + else [] + ), + without_timestamps=options.without_timestamps, + prefix=options.prefix, + hotwords=options.hotwords, ) encoder_output = self.encode(features) - result = self.model.generate( + results = self.model.generate( encoder_output, [prompt] * batch_size, - beam_size=options["beam_size"], - patience=options["patience"], - length_penalty=options["length_penalty"], + beam_size=options.beam_size, + patience=options.patience, + length_penalty=options.length_penalty, max_length=self.max_length, - suppress_blank=options["suppress_blank"], - suppress_tokens=options["suppress_tokens"], + suppress_blank=options.suppress_blank, + suppress_tokens=options.suppress_tokens, return_scores=True, return_no_speech_prob=True, + sampling_temperature=options.temperatures[0], + repetition_penalty=options.repetition_penalty, + no_repeat_ngram_size=options.no_repeat_ngram_size, ) output = [] - for res in result: - output.append({}) + for result in results: # return scores - seq_len = len(res.sequences_ids[0]) - cum_logprob = res.scores[0] * (seq_len ** options["length_penalty"]) - output[-1]["avg_logprob"] = cum_logprob / (seq_len + 1) + seq_len = len(result.sequences_ids[0]) + cum_logprob = result.scores[0] * (seq_len**options.length_penalty) - # 
return no speech prob
-            output[-1]["no_speech_prob"] = res.no_speech_prob
-            output[-1]["tokens"] = res.sequences_ids[0]
+            output.append(
+                dict(
+                    avg_logprob=cum_logprob / (seq_len + 1),
+                    no_speech_prob=result.no_speech_prob,
+                    tokens=result.sequences_ids[0],
+                )
+            )
 
         return encoder_output, output
 

From 17b0be0be21e5cddd69190cf63a39870b00c03ae Mon Sep 17 00:00:00 2001
From: MahmoudAshraf97
Date: Wed, 13 Nov 2024 17:39:38 +0200
Subject: [PATCH 2/8] .

---
 faster_whisper/transcribe.py | 101 ++++++++++++++++++++++++++++-------
 1 file changed, 82 insertions(+), 19 deletions(-)

diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
index 6f4b67b0..17de4584 100644
--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -115,19 +115,83 @@ class BatchedInferencePipeline:
     def __init__(
         self,
         model,
-        options: Optional[TranscriptionOptions] = None,
-        tokenizer=None,
-        language: Optional[str] = None,
     ):
         self.model: WhisperModel = model
-        self.tokenizer = tokenizer
-        self.options = options
-        self.preset_language = language
         self.last_speech_timestamp = 0.0
 
-    def forward(self, features, chunks_metadata, options):
-        encoder_output, outputs = self.model.generate_segment_batched(
-            features, self.tokenizer, options
+    def generate_segment_batched(
+        self,
+        features: np.ndarray,
+        tokenizer: Tokenizer,
+        options: TranscriptionOptions,
+    ):
+        batch_size = features.shape[0]
+
+        prompt = self.model.get_prompt(
+            tokenizer,
+            previous_tokens=(
+                tokenizer.encode(options.initial_prompt)
+                if options.initial_prompt is not None
+                else []
+            ),
+            without_timestamps=options.without_timestamps,
+            prefix=options.prefix,
+            hotwords=options.hotwords,
+        )
+
+        if options.max_new_tokens is not None:
+            max_length = len(prompt) + options.max_new_tokens
+        else:
+            max_length = self.model.max_length
+
+        if max_length > self.model.max_length:
+            raise ValueError(
+                f"The length of the prompt is {len(prompt)}, and `max_new_tokens` is "
+                f"{max_length - len(prompt)}. Thus, the combined length of the prompt "
+                f"and `max_new_tokens` is: {max_length}. This exceeds the "
+                f"`max_length` of the Whisper model: {self.model.max_length}. "
+                "You should either reduce the length of your prompt, or "
+                "reduce the value of `max_new_tokens`, "
+                f"so that their combined length is less than {self.model.max_length}."
+ ) + + encoder_output = self.model.encode(features) + + results = self.model.model.generate( + encoder_output, + [prompt] * batch_size, + beam_size=options.beam_size, + patience=options.patience, + length_penalty=options.length_penalty, + max_length=max_length, + suppress_blank=options.suppress_blank, + suppress_tokens=options.suppress_tokens, + return_scores=True, + return_no_speech_prob=True, + sampling_temperature=options.temperatures[0], + repetition_penalty=options.repetition_penalty, + no_repeat_ngram_size=options.no_repeat_ngram_size, + ) + + output = [] + for result in results: + # return scores + seq_len = len(result.sequences_ids[0]) + cum_logprob = result.scores[0] * (seq_len**options.length_penalty) + + output.append( + dict( + avg_logprob=cum_logprob / (seq_len + 1), + no_speech_prob=result.no_speech_prob, + tokens=result.sequences_ids[0], + ) + ) + + return encoder_output, output + + def forward(self, features, tokenizer, chunks_metadata, options): + encoder_output, outputs = self.generate_segment_batched( + features, tokenizer, options ) segmented_outputs = [] @@ -141,7 +205,7 @@ def forward(self, features, chunks_metadata, options): seek, single_timestamp_ending, ) = self.model._split_segments_by_timestamps( - tokenizer=self.tokenizer, + tokenizer=tokenizer, tokens=output["tokens"], time_offset=chunk_metadata["start_time"], segment_size=segment_size, @@ -151,14 +215,14 @@ def forward(self, features, chunks_metadata, options): segmented_outputs.append( [ dict( - text=self.tokenizer.decode(subsegment["tokens"]), + text=tokenizer.decode(subsegment["tokens"]), avg_logprob=output["avg_logprob"], no_speech_prob=output["no_speech_prob"], tokens=subsegment["tokens"], start=subsegment["start"], end=subsegment["end"], compression_ratio=get_compression_ratio( - self.tokenizer.decode(subsegment["tokens"]) + tokenizer.decode(subsegment["tokens"]) ), seek=int( chunk_metadata["start_time"] * self.model.frames_per_second @@ -170,7 +234,7 @@ def forward(self, features, chunks_metadata, options): if options.word_timestamps: self.last_speech_timestamp = self.model.add_word_timestamps( segmented_outputs, - self.tokenizer, + tokenizer, encoder_output, segment_sizes, options.prepend_punctuations, @@ -279,9 +343,6 @@ def transcribe( output_language: Valid only if multilingual is set to True. Specifies the string representing the output language. One of 'en' (English) or 'hybrid' (code-switched transcription). set as None. - language_detection_threshold: If the maximum probability of the language tokens is - higher than this value, the language is detected. - language_detection_segments: Number of segments to consider for the language detection. compression_ratio_threshold: If the gzip compression ratio is above this value, treat as failed. 
log_prob_threshold: If the average log probability over sampled tokens is @@ -423,7 +484,7 @@ def transcribe( initial_prompt=initial_prompt, prefix=prefix, suppress_blank=suppress_blank, - suppress_tokens=get_suppressed_tokens(self.tokenizer, suppress_tokens), + suppress_tokens=get_suppressed_tokens(tokenizer, suppress_tokens), prepend_punctuations=prepend_punctuations, append_punctuations=append_punctuations, max_new_tokens=max_new_tokens, @@ -445,12 +506,13 @@ def transcribe( duration=duration, duration_after_vad=duration_after_vad, transcription_options=options, - vad_options=None, + vad_options=vad_parameters, all_language_probs=all_language_probs, ) segments = self._batched_segments_generator( features, + tokenizer, chunks_metadata, batch_size, options, @@ -460,13 +522,14 @@ def transcribe( return segments, info def _batched_segments_generator( - self, features, chunks_metadata, batch_size, options, log_progress + self, features, tokenizer, chunks_metadata, batch_size, options, log_progress ): pbar = tqdm(total=len(features), disable=not log_progress, position=0) seg_idx = 0 for i in range(0, len(features), batch_size): results = self.forward( features[i : i + batch_size], + tokenizer, chunks_metadata[i : i + batch_size], options, ) From 08784ed5f3c543021b3884f38d8a31f15554c03e Mon Sep 17 00:00:00 2001 From: MahmoudAshraf97 Date: Wed, 13 Nov 2024 18:07:04 +0200 Subject: [PATCH 3/8] add correct temperature to output segments --- faster_whisper/transcribe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 17de4584..b67448fa 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -57,7 +57,7 @@ class Segment: compression_ratio: float no_speech_prob: float words: Optional[List[Word]] - temperature: Optional[float] = 1.0 + temperature: Optional[float] def _asdict(self): warn( @@ -552,6 +552,7 @@ def _batched_segments_generator( avg_logprob=segment["avg_logprob"], no_speech_prob=segment["no_speech_prob"], compression_ratio=segment["compression_ratio"], + temperature=options.temperatures[0], ) pbar.update(1) From d7d96ed3018fa6492ddd60b8de763adcb4490513 Mon Sep 17 00:00:00 2001 From: MahmoudAshraf97 Date: Wed, 13 Nov 2024 23:21:24 +0200 Subject: [PATCH 4/8] disable prefix --- faster_whisper/transcribe.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index b67448fa..4923d80b 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -135,7 +135,6 @@ def generate_segment_batched( else [] ), without_timestamps=options.without_timestamps, - prefix=options.prefix, hotwords=options.hotwords, ) @@ -268,19 +267,25 @@ def transcribe( log_prob_threshold: Optional[float] = -1.0, log_prob_low_threshold: Optional[float] = None, no_speech_threshold: Optional[float] = 0.6, + condition_on_previous_text: bool = True, + prompt_reset_on_temperature: float = 0.5, initial_prompt: Optional[Union[str, Iterable[int]]] = None, prefix: Optional[str] = None, suppress_blank: bool = True, suppress_tokens: Optional[List[int]] = [-1], without_timestamps: bool = True, + max_initial_timestamp: float = 1.0, word_timestamps: bool = False, prepend_punctuations: str = "\"'“¿([{-", append_punctuations: str = "\"'.。,,!!??::”)]}、", + multilingual: bool = False, + output_language: Optional[str] = None, vad_filter: bool = True, vad_parameters: Optional[Union[dict, VadOptions]] = None, 
         max_new_tokens: Optional[int] = None,
         chunk_length: Optional[int] = None,
         clip_timestamps: Optional[List[dict]] = None,
+        hallucination_silence_threshold: Optional[float] = None,
         batch_size: int = 8,
         hotwords: Optional[str] = None,
         language_detection_threshold: Optional[float] = 0.5,
         language_detection_segments: int = 1,
     ) -> Tuple[Iterable[Segment], TranscriptionInfo]:
         """transcribe audio in chunks in batched fashion and return with language info.
@@ -306,7 +311,6 @@ def transcribe(
                 only the first value is used.
             initial_prompt: Optional text string or iterable of token ids to provide as a
                 prompt for each window.
-            prefix: Optional text to provide as a prefix at the beginning of each window.
             suppress_blank: Suppress blank outputs at the beginning of the sampling.
             suppress_tokens: List of token IDs to suppress. -1 will suppress a default set
                 of symbols as defined in `tokenizer.non_speech_tokens()`.
@@ -338,11 +342,6 @@ def transcribe(
             language_detection_segments: Number of segments to consider for the language detection.
 
         Unused Arguments
-            max_initial_timestamp: The initial timestamp cannot be later than this, set at 0.0.
-            multilingual: If True, perform transcription on multilingual videos. Set as False.
-            output_language: Valid only if multilingual is set to True.
-                Specifies the string representing the output language. One of
-                'en' (English) or 'hybrid' (code-switched transcription). set as None.
             compression_ratio_threshold: If the gzip compression ratio is above this value,
                 treat as failed.
             log_prob_threshold: If the average log probability over sampled tokens is
                 below this value, treat as failed.
             log_prob_low_threshold: This parameter alone is sufficient to skip an output text,
                 whereas log_prob_threshold also looks for appropriate no_speech_threshold value.
                 This value should be less than log_prob_threshold.
             no_speech_threshold: If the no_speech probability is higher than this value AND
                 the average log probability over sampled tokens is below `log_prob_threshold`,
                 consider the segment as silent.
@@ -353,16 +352,21 @@ def transcribe(
             condition_on_previous_text: If True, the previous output of the model is provided
                 as a prompt for the next window; disabling may make the text inconsistent across
                 windows, but the model becomes less prone to getting stuck in a failure loop,
                 such as repetition looping or timestamps going out of sync. Set as False
             prompt_reset_on_temperature: Resets prompt if temperature is above this value.
                 Arg has effect only if condition_on_previous_text is True. Set at 0.5
-
+            prefix: Optional text to provide as a prefix at the beginning of each window.
+            max_initial_timestamp: The initial timestamp cannot be later than this, set at 0.0.
+            multilingual: If True, perform transcription on multilingual videos. Set as False.
+            output_language: Valid only if multilingual is set to True.
+                Specifies the string representing the output language. One of
+                'en' (English) or 'hybrid' (code-switched transcription). set as None.
+            hallucination_silence_threshold: Optional[float]
+                When word_timestamps is True, skip silent periods longer than this threshold
+                (in seconds) when a possible hallucination is detected. set as None.
Returns: A tuple with: From cce3c675ed582ef9432c51772bfa40d3b02aa55b Mon Sep 17 00:00:00 2001 From: MahmoudAshraf97 Date: Sat, 16 Nov 2024 13:01:45 +0200 Subject: [PATCH 5/8] reduce diff [skip ci] --- faster_whisper/transcribe.py | 65 +++++++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 8 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 4923d80b..6ddf6e5f 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -243,11 +243,45 @@ def forward(self, features, tokenizer, chunks_metadata, options): return segmented_outputs + def get_language_and_tokenizer( + self, audio, task: Optional[str] = None, language: Optional[str] = None + ): + all_language_probs = None + language_probability = 1.0 + + if self.tokenizer is None: + if not language: + ( + language, + language_probability, + all_language_probs, + ) = self.model.detect_language(audio) + task = task or "transcribe" + self.tokenizer = Tokenizer( + self.model.hf_tokenizer, + self.model.model.is_multilingual, + task=task, + language=language, + ) + else: + if task is not None: + self.tokenizer.task = self.tokenizer.tokenizer.token_to_id( + f"<|{task}|>" + ) + + if language is not None: + self.tokenizer.language = self.tokenizer.tokenizer.token_to_id( + f"<|{language}|>" + ) + self.tokenizer.language_code = language + + return language, language_probability, task, all_language_probs + def transcribe( self, audio: Union[str, BinaryIO, np.ndarray], language: Optional[str] = None, - task: str = "transcribe", + task: str = None, log_progress: bool = False, beam_size: int = 5, best_of: int = 5, @@ -288,8 +322,6 @@ def transcribe( hallucination_silence_threshold: Optional[float] = None, batch_size: int = 8, hotwords: Optional[str] = None, - language_detection_threshold: Optional[float] = 0.5, - language_detection_segments: int = 1, ) -> Tuple[Iterable[Segment], TranscriptionInfo]: """transcribe audio in chunks in batched fashion and return with language info. @@ -337,9 +369,6 @@ def transcribe( batch_size: the maximum number of parallel requests to model for decoding. hotwords: Hotwords/hint phrases to the model. Has no effect if prefix is not None. - language_detection_threshold: If the maximum probability of the language tokens is - higher than this value, the language is detected. - language_detection_segments: Number of segments to consider for the language detection. 
Unused Arguments compression_ratio_threshold: If the gzip compression ratio is above this value, @@ -488,7 +517,7 @@ def transcribe( initial_prompt=initial_prompt, prefix=prefix, suppress_blank=suppress_blank, - suppress_tokens=get_suppressed_tokens(tokenizer, suppress_tokens), + suppress_tokens=get_suppressed_tokens(self.tokenizer, suppress_tokens), prepend_punctuations=prepend_punctuations, append_punctuations=append_punctuations, max_new_tokens=max_new_tokens, @@ -513,10 +542,27 @@ def transcribe( vad_options=vad_parameters, all_language_probs=all_language_probs, ) + + audio_chunks, chunks_metadata = collect_chunks(audio, clip_timestamps) + features = ( + np.stack( + [ + pad_or_trim( + self.model.feature_extractor(chunk)[ + ..., + : chunk.shape[0] // self.model.feature_extractor.hop_length, + ] + ) + for chunk in audio_chunks + ] + ) + if duration_after_vad + else [] + ) segments = self._batched_segments_generator( features, - tokenizer, + self.tokenizer, chunks_metadata, batch_size, options, @@ -562,6 +608,9 @@ def _batched_segments_generator( pbar.update(1) pbar.close() + # revert the tokenizer if multilingual inference is enabled + if self.preset_language is None: + self.tokenizer = None self.last_speech_timestamp = 0.0 From aededdf5ccf0b5766580ad6665abd908b53792d2 Mon Sep 17 00:00:00 2001 From: MahmoudAshraf97 Date: Sat, 16 Nov 2024 13:03:25 +0200 Subject: [PATCH 6/8] . [skip ci] --- faster_whisper/transcribe.py | 57 +++++++++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 6ddf6e5f..7325a6c3 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -119,6 +119,58 @@ def __init__( self.model: WhisperModel = model self.last_speech_timestamp = 0.0 + def forward(self, features, tokenizer, chunks_metadata, options): + encoder_output, outputs = self.generate_segment_batched( + features, tokenizer, options + ) + + segmented_outputs = [] + segment_sizes = [] + for chunk_metadata, output in zip(chunks_metadata, outputs): + duration = chunk_metadata["end_time"] - chunk_metadata["start_time"] + segment_size = int(ceil(duration) * self.model.frames_per_second) + segment_sizes.append(segment_size) + ( + subsegments, + seek, + single_timestamp_ending, + ) = self.model._split_segments_by_timestamps( + tokenizer=tokenizer, + tokens=output["tokens"], + time_offset=chunk_metadata["start_time"], + segment_size=segment_size, + segment_duration=duration, + seek=0, + ) + segmented_outputs.append( + [ + dict( + text=tokenizer.decode(subsegment["tokens"]), + avg_logprob=output["avg_logprob"], + no_speech_prob=output["no_speech_prob"], + tokens=subsegment["tokens"], + start=subsegment["start"], + end=subsegment["end"], + compression_ratio=get_compression_ratio( + tokenizer.decode(subsegment["tokens"]) + ), + ) + for subsegment in subsegments + ] + ) + if options.word_timestamps: + self.last_speech_timestamp = self.model.add_word_timestamps( + segmented_outputs, + tokenizer, + encoder_output, + segment_sizes, + options.prepend_punctuations, + options.append_punctuations, + self.last_speech_timestamp, + ) + + return segmented_outputs + def generate_segment_batched( self, features: np.ndarray, @@ -223,9 +275,6 @@ def forward(self, features, tokenizer, chunks_metadata, options): compression_ratio=get_compression_ratio( tokenizer.decode(subsegment["tokens"]) ), - seek=int( - chunk_metadata["start_time"] * self.model.frames_per_second - ), ) for subsegment in subsegments ] @@ -542,7 
+591,7 @@ def transcribe( vad_options=vad_parameters, all_language_probs=all_language_probs, ) - + audio_chunks, chunks_metadata = collect_chunks(audio, clip_timestamps) features = ( np.stack( From 9ae92c049f2f2670726c200f66321e0a36fd1a57 Mon Sep 17 00:00:00 2001 From: MahmoudAshraf97 Date: Sat, 16 Nov 2024 13:23:36 +0200 Subject: [PATCH 7/8] fix rebase [skip ci] --- faster_whisper/transcribe.py | 113 ++--------------------------------- 1 file changed, 6 insertions(+), 107 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 7325a6c3..01c1e67e 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -240,97 +240,11 @@ def generate_segment_batched( return encoder_output, output - def forward(self, features, tokenizer, chunks_metadata, options): - encoder_output, outputs = self.generate_segment_batched( - features, tokenizer, options - ) - - segmented_outputs = [] - segment_sizes = [] - for chunk_metadata, output in zip(chunks_metadata, outputs): - duration = chunk_metadata["end_time"] - chunk_metadata["start_time"] - segment_size = int(ceil(duration) * self.model.frames_per_second) - segment_sizes.append(segment_size) - ( - subsegments, - seek, - single_timestamp_ending, - ) = self.model._split_segments_by_timestamps( - tokenizer=tokenizer, - tokens=output["tokens"], - time_offset=chunk_metadata["start_time"], - segment_size=segment_size, - segment_duration=duration, - seek=0, - ) - segmented_outputs.append( - [ - dict( - text=tokenizer.decode(subsegment["tokens"]), - avg_logprob=output["avg_logprob"], - no_speech_prob=output["no_speech_prob"], - tokens=subsegment["tokens"], - start=subsegment["start"], - end=subsegment["end"], - compression_ratio=get_compression_ratio( - tokenizer.decode(subsegment["tokens"]) - ), - ) - for subsegment in subsegments - ] - ) - if options.word_timestamps: - self.last_speech_timestamp = self.model.add_word_timestamps( - segmented_outputs, - tokenizer, - encoder_output, - segment_sizes, - options.prepend_punctuations, - options.append_punctuations, - self.last_speech_timestamp, - ) - - return segmented_outputs - - def get_language_and_tokenizer( - self, audio, task: Optional[str] = None, language: Optional[str] = None - ): - all_language_probs = None - language_probability = 1.0 - - if self.tokenizer is None: - if not language: - ( - language, - language_probability, - all_language_probs, - ) = self.model.detect_language(audio) - task = task or "transcribe" - self.tokenizer = Tokenizer( - self.model.hf_tokenizer, - self.model.model.is_multilingual, - task=task, - language=language, - ) - else: - if task is not None: - self.tokenizer.task = self.tokenizer.tokenizer.token_to_id( - f"<|{task}|>" - ) - - if language is not None: - self.tokenizer.language = self.tokenizer.tokenizer.token_to_id( - f"<|{language}|>" - ) - self.tokenizer.language_code = language - - return language, language_probability, task, all_language_probs - def transcribe( self, audio: Union[str, BinaryIO, np.ndarray], language: Optional[str] = None, - task: str = None, + task: str = "transcribe", log_progress: bool = False, beam_size: int = 5, best_of: int = 5, @@ -371,6 +285,8 @@ def transcribe( hallucination_silence_threshold: Optional[float] = None, batch_size: int = 8, hotwords: Optional[str] = None, + language_detection_threshold: Optional[float] = 0.5, + language_detection_segments: int = 1, ) -> Tuple[Iterable[Segment], TranscriptionInfo]: """transcribe audio in chunks in batched fashion and return with language info. 
@@ -418,6 +334,9 @@ def transcribe( batch_size: the maximum number of parallel requests to model for decoding. hotwords: Hotwords/hint phrases to the model. Has no effect if prefix is not None. + language_detection_threshold: If the maximum probability of the language tokens is + higher than this value, the language is detected. + language_detection_segments: Number of segments to consider for the language detection. Unused Arguments compression_ratio_threshold: If the gzip compression ratio is above this value, @@ -592,23 +511,6 @@ def transcribe( all_language_probs=all_language_probs, ) - audio_chunks, chunks_metadata = collect_chunks(audio, clip_timestamps) - features = ( - np.stack( - [ - pad_or_trim( - self.model.feature_extractor(chunk)[ - ..., - : chunk.shape[0] // self.model.feature_extractor.hop_length, - ] - ) - for chunk in audio_chunks - ] - ) - if duration_after_vad - else [] - ) - segments = self._batched_segments_generator( features, self.tokenizer, @@ -657,9 +559,6 @@ def _batched_segments_generator( pbar.update(1) pbar.close() - # revert the tokenizer if multilingual inference is enabled - if self.preset_language is None: - self.tokenizer = None self.last_speech_timestamp = 0.0 From 07fb5758f4dc7e0a11496672848e0411df000d25 Mon Sep 17 00:00:00 2001 From: MahmoudAshraf97 Date: Sat, 16 Nov 2024 13:28:57 +0200 Subject: [PATCH 8/8] fix rebase --- faster_whisper/transcribe.py | 63 ++++-------------------------------- 1 file changed, 6 insertions(+), 57 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 01c1e67e..bf091981 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -154,6 +154,9 @@ def forward(self, features, tokenizer, chunks_metadata, options): compression_ratio=get_compression_ratio( tokenizer.decode(subsegment["tokens"]) ), + seek=int( + chunk_metadata["start_time"] * self.model.frames_per_second + ), ) for subsegment in subsegments ] @@ -455,7 +458,7 @@ def transcribe( language_probability = 1 - self.tokenizer = Tokenizer( + tokenizer = Tokenizer( self.model.hf_tokenizer, self.model.model.is_multilingual, task=task, @@ -485,7 +488,7 @@ def transcribe( initial_prompt=initial_prompt, prefix=prefix, suppress_blank=suppress_blank, - suppress_tokens=get_suppressed_tokens(self.tokenizer, suppress_tokens), + suppress_tokens=get_suppressed_tokens(tokenizer, suppress_tokens), prepend_punctuations=prepend_punctuations, append_punctuations=append_punctuations, max_new_tokens=max_new_tokens, @@ -513,7 +516,7 @@ def transcribe( segments = self._batched_segments_generator( features, - self.tokenizer, + tokenizer, chunks_metadata, batch_size, options, @@ -1738,60 +1741,6 @@ def find_alignment( ) return return_list - def generate_segment_batched( - self, - features: np.ndarray, - tokenizer: Tokenizer, - options: TranscriptionOptions, - ): - batch_size = features.shape[0] - - prompt = self.get_prompt( - tokenizer, - previous_tokens=( - tokenizer.encode(options.initial_prompt) - if options.initial_prompt is not None - else [] - ), - without_timestamps=options.without_timestamps, - prefix=options.prefix, - hotwords=options.hotwords, - ) - - encoder_output = self.encode(features) - - results = self.model.generate( - encoder_output, - [prompt] * batch_size, - beam_size=options.beam_size, - patience=options.patience, - length_penalty=options.length_penalty, - max_length=self.max_length, - suppress_blank=options.suppress_blank, - suppress_tokens=options.suppress_tokens, - return_scores=True, - return_no_speech_prob=True, 
- sampling_temperature=options.temperatures[0], - repetition_penalty=options.repetition_penalty, - no_repeat_ngram_size=options.no_repeat_ngram_size, - ) - - output = [] - for result in results: - # return scores - seq_len = len(result.sequences_ids[0]) - cum_logprob = result.scores[0] * (seq_len**options.length_penalty) - - output.append( - dict( - avg_logprob=cum_logprob / (seq_len + 1), - no_speech_prob=result.no_speech_prob, - tokens=result.sequences_ids[0], - ) - ) - - return encoder_output, output - def detect_language( self, audio: Optional[np.ndarray] = None,
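With the series applied, `BatchedInferencePipeline` is constructed from a `WhisperModel` alone (patch 2 drops the tokenizer/options/language constructor arguments), and `transcribe()` resolves language detection, feature batching, and decoding itself. Below is a minimal usage sketch of the resulting API; the model size, device, compute type, and audio path are illustrative assumptions rather than part of the patches, and the top-level import follows the upstream README.

    from faster_whisper import BatchedInferencePipeline, WhisperModel

    # Illustrative model/device choices; any CTranslate2 Whisper model works.
    model = WhisperModel("large-v3", device="cuda", compute_type="float16")

    # After patch 2, the pipeline takes only the model; tokenizer and options
    # are created inside transcribe().
    pipeline = BatchedInferencePipeline(model)

    # batch_size defaults to 8 after patch 1; segments is a lazy generator.
    segments, info = pipeline.transcribe("audio.wav", batch_size=8)

    print(info.language, info.language_probability)
    for segment in segments:
        # Each Segment also carries the sampling temperature as of patch 3.
        print(f"[{segment.start:.2f} -> {segment.end:.2f}] {segment.text}")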