From bfeb7c3df4cef3ed526c85ef7708a5112dc5495e Mon Sep 17 00:00:00 2001 From: MahmoudAshraf97 Date: Wed, 13 Nov 2024 14:34:51 +0200 Subject: [PATCH 1/8] initial cleanup --- faster_whisper/transcribe.py | 139 ++++++++++++++++------------------- 1 file changed, 63 insertions(+), 76 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index d3d2bdf7..6f4b67b0 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -68,7 +68,6 @@ def _asdict(self): return asdict(self) -# Added additional parameters for multilingual videos and fixes below @dataclass class TranscriptionOptions: beam_size: int @@ -112,18 +111,7 @@ class TranscriptionInfo: vad_options: VadOptions -# The code below is originally from HF pipeline and is used in whisper-x -# (https://github.com/m-bain/whisperX) and adapted for faster_whisper - - class BatchedInferencePipeline: - """ - Huggingface Pipeline wrapper for WhisperModel. - Copyright (c) 2022, Max Bain - All rights reserved. - Modified by Mobius Labs GmbH - """ - def __init__( self, model, @@ -137,9 +125,9 @@ def __init__( self.preset_language = language self.last_speech_timestamp = 0.0 - def forward(self, features, chunks_metadata, **forward_params): + def forward(self, features, chunks_metadata, options): encoder_output, outputs = self.model.generate_segment_batched( - features, self.tokenizer, forward_params + features, self.tokenizer, options ) segmented_outputs = [] @@ -179,14 +167,14 @@ def forward(self, features, chunks_metadata, **forward_params): for subsegment in subsegments ] ) - if forward_params["word_timestamps"]: + if options.word_timestamps: self.last_speech_timestamp = self.model.add_word_timestamps( segmented_outputs, self.tokenizer, encoder_output, segment_sizes, - forward_params["prepend_punctuations"], - forward_params["append_punctuations"], + options.prepend_punctuations, + options.append_punctuations, self.last_speech_timestamp, ) @@ -229,7 +217,7 @@ def transcribe( max_new_tokens: Optional[int] = None, chunk_length: Optional[int] = None, clip_timestamps: Optional[List[dict]] = None, - batch_size: int = 16, + batch_size: int = 8, hotwords: Optional[str] = None, language_detection_threshold: Optional[float] = 0.5, language_detection_segments: int = 1, @@ -250,22 +238,11 @@ def transcribe( repetition_penalty: Penalty applied to the score of previously generated tokens (set > 1 to penalize). no_repeat_ngram_size: Prevent repetitions of ngrams with this size (set 0 to disable). - temperature: Temperature for sampling. It can be a tuple of temperatures, - which will be successively used upon failures according to either - `compression_ratio_threshold` or `log_prob_threshold`. - compression_ratio_threshold: If the gzip compression ratio is above this value, - treat as failed. - log_prob_threshold: If the average log probability over sampled tokens is - below this value, treat as failed. - log_prob_low_threshold: This parameter alone is sufficient to skip an output text, - whereas log_prob_threshold also looks for appropriate no_speech_threshold value. - This value should be less than log_prob_threshold. - no_speech_threshold: If the no_speech probability is higher than this value AND - the average log probability over sampled tokens is below `log_prob_threshold`, - consider the segment as silent. + temperature: Temperature for sampling. If a list or tuple is passed, + only the first value is used. 
         initial_prompt: Optional text string or iterable of token ids to provide as a
-              prompt for the first window.
-            prefix: Optional text to provide as a prefix for the first window.
+              prompt for each window.
+            prefix: Optional text to provide as a prefix at the beginning of each window.
             suppress_blank: Suppress blank outputs at the beginning of the sampling.
             suppress_tokens: List of token IDs to suppress. -1 will suppress a default set
                 of symbols as defined in `tokenizer.non_speech_tokens()`.
@@ -296,28 +273,34 @@ def transcribe(
                 higher than this value, the language is detected.
             language_detection_segments: Number of segments to consider for the language detection.
-        Static params: (Fixed for batched version)
+        Unused Arguments
             max_initial_timestamp: The initial timestamp cannot be later than this, set at 0.0.
             multilingual: If True, perform transcription on multilingual videos. Set as False.
             output_language: Valid only if multilingual is set to True.
                 Specifies the string representing the output language. One of
                 'en' (English) or 'hybrid' (code-switched transcription). set as None.
+            language_detection_threshold: If the maximum probability of the language tokens is
+                higher than this value, the language is detected.
+            language_detection_segments: Number of segments to consider for the language detection.
+            compression_ratio_threshold: If the gzip compression ratio is above this value,
+                treat as failed.
+            log_prob_threshold: If the average log probability over sampled tokens is
+                below this value, treat as failed.
+            log_prob_low_threshold: This parameter alone is sufficient to skip an output text,
+                whereas log_prob_threshold also looks for appropriate no_speech_threshold value.
+                This value should be less than log_prob_threshold.
+            no_speech_threshold: If the no_speech probability is higher than this value AND
+                the average log probability over sampled tokens is below `log_prob_threshold`,
+                consider the segment as silent.
+            hallucination_silence_threshold: Optional[float]
+                When word_timestamps is True, skip silent periods longer than this threshold
+                (in seconds) when a possible hallucination is detected. set as None.
             condition_on_previous_text: If True, the previous output of the model is provided
                 as a prompt for the next window; disabling may make the text inconsistent across
                 windows, but the model becomes less prone to getting stuck in a failure loop,
                 such as repetition looping or timestamps going out of sync. Set as False
             prompt_reset_on_temperature: Resets prompt if temperature is above this value.
                 Arg has effect only if condition_on_previous_text is True. Set at 0.5
-        #TODO: support "hallucination_silence_threshold" when "word_timestamps=True"
-        hallucination_silence_threshold: Optional[float]
-            When word_timestamps is True, skip silent periods longer than this threshold
-            (in seconds) when a possible hallucination is detected. set as None.
-
-        unused:
-            language_detection_threshold: If the maximum probability of the language tokens is
-                higher than this value, the language is detected.
-            language_detection_segments: Number of segments to consider for the language detection.
- Returns: A tuple with: @@ -421,8 +404,7 @@ def transcribe( np.stack([pad_or_trim(feature) for feature in features]) if features else [] ) - # batched options: see the difference with default options in WhisperModel - batched_options = TranscriptionOptions( + options = TranscriptionOptions( beam_size=beam_size, best_of=best_of, patience=patience, @@ -434,7 +416,9 @@ def transcribe( no_speech_threshold=no_speech_threshold, compression_ratio_threshold=compression_ratio_threshold, temperatures=( - temperature if isinstance(temperature, (list, tuple)) else [temperature] + temperature[:1] + if isinstance(temperature, (list, tuple)) + else [temperature] ), initial_prompt=initial_prompt, prefix=prefix, @@ -447,7 +431,7 @@ def transcribe( word_timestamps=word_timestamps, hallucination_silence_threshold=None, condition_on_previous_text=False, - clip_timestamps="0", + clip_timestamps=clip_timestamps, prompt_reset_on_temperature=0.5, multilingual=False, output_language=None, @@ -460,7 +444,7 @@ def transcribe( language_probability=language_probability, duration=duration, duration_after_vad=duration_after_vad, - transcription_options=batched_options, + transcription_options=options, vad_options=None, all_language_probs=all_language_probs, ) @@ -469,7 +453,7 @@ def transcribe( features, chunks_metadata, batch_size, - batched_options, + options, log_progress, ) @@ -484,7 +468,7 @@ def _batched_segments_generator( results = self.forward( features[i : i + batch_size], chunks_metadata[i : i + batch_size], - **asdict(options), + options, ) for result in results: @@ -1693,50 +1677,53 @@ def generate_segment_batched( self, features: np.ndarray, tokenizer: Tokenizer, - options: dict, + options: TranscriptionOptions, ): batch_size = features.shape[0] - all_tokens = [] - prompt_reset_since = 0 - if options["initial_prompt"] is not None: - initial_prompt = " " + options["initial_prompt"].strip() - initial_prompt_tokens = tokenizer.encode(initial_prompt) - all_tokens.extend(initial_prompt_tokens) - previous_tokens = all_tokens[prompt_reset_since:] prompt = self.get_prompt( tokenizer, - previous_tokens, - without_timestamps=options["without_timestamps"], - prefix=options["prefix"], + previous_tokens=( + tokenizer.encode(options.initial_prompt) + if options.initial_prompt is not None + else [] + ), + without_timestamps=options.without_timestamps, + prefix=options.prefix, + hotwords=options.hotwords, ) encoder_output = self.encode(features) - result = self.model.generate( + results = self.model.generate( encoder_output, [prompt] * batch_size, - beam_size=options["beam_size"], - patience=options["patience"], - length_penalty=options["length_penalty"], + beam_size=options.beam_size, + patience=options.patience, + length_penalty=options.length_penalty, max_length=self.max_length, - suppress_blank=options["suppress_blank"], - suppress_tokens=options["suppress_tokens"], + suppress_blank=options.suppress_blank, + suppress_tokens=options.suppress_tokens, return_scores=True, return_no_speech_prob=True, + sampling_temperature=options.temperatures[0], + repetition_penalty=options.repetition_penalty, + no_repeat_ngram_size=options.no_repeat_ngram_size, ) output = [] - for res in result: - output.append({}) + for result in results: # return scores - seq_len = len(res.sequences_ids[0]) - cum_logprob = res.scores[0] * (seq_len ** options["length_penalty"]) - output[-1]["avg_logprob"] = cum_logprob / (seq_len + 1) + seq_len = len(result.sequences_ids[0]) + cum_logprob = result.scores[0] * (seq_len**options.length_penalty) - # 
return no speech prob
-            output[-1]["no_speech_prob"] = res.no_speech_prob
-            output[-1]["tokens"] = res.sequences_ids[0]
+            output.append(
+                dict(
+                    avg_logprob=cum_logprob / (seq_len + 1),
+                    no_speech_prob=result.no_speech_prob,
+                    tokens=result.sequences_ids[0],
+                )
+            )
 
         return encoder_output, output
 

From 17b0be0be21e5cddd69190cf63a39870b00c03ae Mon Sep 17 00:00:00 2001
From: MahmoudAshraf97
Date: Wed, 13 Nov 2024 17:39:38 +0200
Subject: [PATCH 2/8] .

---
 faster_whisper/transcribe.py | 101 ++++++++++++++++++++++++++++-------
 1 file changed, 82 insertions(+), 19 deletions(-)

diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
index 6f4b67b0..17de4584 100644
--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -115,19 +115,83 @@ class BatchedInferencePipeline:
     def __init__(
         self,
         model,
-        options: Optional[TranscriptionOptions] = None,
-        tokenizer=None,
-        language: Optional[str] = None,
     ):
         self.model: WhisperModel = model
-        self.tokenizer = tokenizer
-        self.options = options
-        self.preset_language = language
         self.last_speech_timestamp = 0.0
 
-    def forward(self, features, chunks_metadata, options):
-        encoder_output, outputs = self.model.generate_segment_batched(
-            features, self.tokenizer, options
+    def generate_segment_batched(
+        self,
+        features: np.ndarray,
+        tokenizer: Tokenizer,
+        options: TranscriptionOptions,
+    ):
+        batch_size = features.shape[0]
+
+        prompt = self.model.get_prompt(
+            tokenizer,
+            previous_tokens=(
+                tokenizer.encode(options.initial_prompt)
+                if options.initial_prompt is not None
+                else []
+            ),
+            without_timestamps=options.without_timestamps,
+            prefix=options.prefix,
+            hotwords=options.hotwords,
+        )
+
+        if options.max_new_tokens is not None:
+            max_length = len(prompt) + options.max_new_tokens
+        else:
+            max_length = self.model.max_length
+
+        if max_length > self.model.max_length:
+            raise ValueError(
+                f"The length of the prompt is {len(prompt)}, and `max_new_tokens` is "
+                f"{max_length - len(prompt)}. Thus, the combined length of the prompt "
+                f"and `max_new_tokens` is: {max_length}. This exceeds the "
+                f"`max_length` of the Whisper model: {self.model.max_length}. "
+                "You should either reduce the length of your prompt, or "
+                "reduce the value of `max_new_tokens`, "
+                f"so that their combined length is less than {self.model.max_length}."
+ ) + + encoder_output = self.model.encode(features) + + results = self.model.model.generate( + encoder_output, + [prompt] * batch_size, + beam_size=options.beam_size, + patience=options.patience, + length_penalty=options.length_penalty, + max_length=max_length, + suppress_blank=options.suppress_blank, + suppress_tokens=options.suppress_tokens, + return_scores=True, + return_no_speech_prob=True, + sampling_temperature=options.temperatures[0], + repetition_penalty=options.repetition_penalty, + no_repeat_ngram_size=options.no_repeat_ngram_size, + ) + + output = [] + for result in results: + # return scores + seq_len = len(result.sequences_ids[0]) + cum_logprob = result.scores[0] * (seq_len**options.length_penalty) + + output.append( + dict( + avg_logprob=cum_logprob / (seq_len + 1), + no_speech_prob=result.no_speech_prob, + tokens=result.sequences_ids[0], + ) + ) + + return encoder_output, output + + def forward(self, features, tokenizer, chunks_metadata, options): + encoder_output, outputs = self.generate_segment_batched( + features, tokenizer, options ) segmented_outputs = [] @@ -141,7 +205,7 @@ def forward(self, features, chunks_metadata, options): seek, single_timestamp_ending, ) = self.model._split_segments_by_timestamps( - tokenizer=self.tokenizer, + tokenizer=tokenizer, tokens=output["tokens"], time_offset=chunk_metadata["start_time"], segment_size=segment_size, @@ -151,14 +215,14 @@ def forward(self, features, chunks_metadata, options): segmented_outputs.append( [ dict( - text=self.tokenizer.decode(subsegment["tokens"]), + text=tokenizer.decode(subsegment["tokens"]), avg_logprob=output["avg_logprob"], no_speech_prob=output["no_speech_prob"], tokens=subsegment["tokens"], start=subsegment["start"], end=subsegment["end"], compression_ratio=get_compression_ratio( - self.tokenizer.decode(subsegment["tokens"]) + tokenizer.decode(subsegment["tokens"]) ), seek=int( chunk_metadata["start_time"] * self.model.frames_per_second @@ -170,7 +234,7 @@ def forward(self, features, chunks_metadata, options): if options.word_timestamps: self.last_speech_timestamp = self.model.add_word_timestamps( segmented_outputs, - self.tokenizer, + tokenizer, encoder_output, segment_sizes, options.prepend_punctuations, @@ -279,9 +343,6 @@ def transcribe( output_language: Valid only if multilingual is set to True. Specifies the string representing the output language. One of 'en' (English) or 'hybrid' (code-switched transcription). set as None. - language_detection_threshold: If the maximum probability of the language tokens is - higher than this value, the language is detected. - language_detection_segments: Number of segments to consider for the language detection. compression_ratio_threshold: If the gzip compression ratio is above this value, treat as failed. 
log_prob_threshold: If the average log probability over sampled tokens is @@ -423,7 +484,7 @@ def transcribe( initial_prompt=initial_prompt, prefix=prefix, suppress_blank=suppress_blank, - suppress_tokens=get_suppressed_tokens(self.tokenizer, suppress_tokens), + suppress_tokens=get_suppressed_tokens(tokenizer, suppress_tokens), prepend_punctuations=prepend_punctuations, append_punctuations=append_punctuations, max_new_tokens=max_new_tokens, @@ -445,12 +506,13 @@ def transcribe( duration=duration, duration_after_vad=duration_after_vad, transcription_options=options, - vad_options=None, + vad_options=vad_parameters, all_language_probs=all_language_probs, ) segments = self._batched_segments_generator( features, + tokenizer, chunks_metadata, batch_size, options, @@ -460,13 +522,14 @@ def transcribe( return segments, info def _batched_segments_generator( - self, features, chunks_metadata, batch_size, options, log_progress + self, features, tokenizer, chunks_metadata, batch_size, options, log_progress ): pbar = tqdm(total=len(features), disable=not log_progress, position=0) seg_idx = 0 for i in range(0, len(features), batch_size): results = self.forward( features[i : i + batch_size], + tokenizer, chunks_metadata[i : i + batch_size], options, ) From 08784ed5f3c543021b3884f38d8a31f15554c03e Mon Sep 17 00:00:00 2001 From: MahmoudAshraf97 Date: Wed, 13 Nov 2024 18:07:04 +0200 Subject: [PATCH 3/8] add correct temperature to output segments --- faster_whisper/transcribe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 17de4584..b67448fa 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -57,7 +57,7 @@ class Segment: compression_ratio: float no_speech_prob: float words: Optional[List[Word]] - temperature: Optional[float] = 1.0 + temperature: Optional[float] def _asdict(self): warn( @@ -552,6 +552,7 @@ def _batched_segments_generator( avg_logprob=segment["avg_logprob"], no_speech_prob=segment["no_speech_prob"], compression_ratio=segment["compression_ratio"], + temperature=options.temperatures[0], ) pbar.update(1) From d7d96ed3018fa6492ddd60b8de763adcb4490513 Mon Sep 17 00:00:00 2001 From: MahmoudAshraf97 Date: Wed, 13 Nov 2024 23:21:24 +0200 Subject: [PATCH 4/8] disable prefix --- faster_whisper/transcribe.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index b67448fa..4923d80b 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -135,7 +135,6 @@ def generate_segment_batched( else [] ), without_timestamps=options.without_timestamps, - prefix=options.prefix, hotwords=options.hotwords, ) @@ -268,19 +267,25 @@ def transcribe( log_prob_threshold: Optional[float] = -1.0, log_prob_low_threshold: Optional[float] = None, no_speech_threshold: Optional[float] = 0.6, + condition_on_previous_text: bool = True, + prompt_reset_on_temperature: float = 0.5, initial_prompt: Optional[Union[str, Iterable[int]]] = None, prefix: Optional[str] = None, suppress_blank: bool = True, suppress_tokens: Optional[List[int]] = [-1], without_timestamps: bool = True, + max_initial_timestamp: float = 1.0, word_timestamps: bool = False, prepend_punctuations: str = "\"'“¿([{-", append_punctuations: str = "\"'.。,,!!??::”)]}、", + multilingual: bool = False, + output_language: Optional[str] = None, vad_filter: bool = True, vad_parameters: Optional[Union[dict, VadOptions]] = None, 
         max_new_tokens: Optional[int] = None,
         chunk_length: Optional[int] = None,
         clip_timestamps: Optional[List[dict]] = None,
+        hallucination_silence_threshold: Optional[float] = None,
         batch_size: int = 8,
         hotwords: Optional[str] = None,
         language_detection_threshold: Optional[float] = 0.5,
         language_detection_segments: int = 1,
     ) -> Tuple[Iterable[Segment], TranscriptionInfo]:
         """transcribe audio in chunks in batched fashion and return with language info.
@@ -306,7 +311,6 @@ def transcribe(
                 only the first value is used.
             initial_prompt: Optional text string or iterable of token ids to provide as a
                 prompt for each window.
-            prefix: Optional text to provide as a prefix at the beginning of each window.
             suppress_blank: Suppress blank outputs at the beginning of the sampling.
             suppress_tokens: List of token IDs to suppress. -1 will suppress a default set
                 of symbols as defined in `tokenizer.non_speech_tokens()`.
@@ -338,11 +342,6 @@ def transcribe(
             language_detection_segments: Number of segments to consider for the language detection.
 
         Unused Arguments
-            max_initial_timestamp: The initial timestamp cannot be later than this, set at 0.0.
-            multilingual: If True, perform transcription on multilingual videos. Set as False.
-            output_language: Valid only if multilingual is set to True.
-                Specifies the string representing the output language. One of
-                'en' (English) or 'hybrid' (code-switched transcription). set as None.
             compression_ratio_threshold: If the gzip compression ratio is above this value,
                 treat as failed.
             log_prob_threshold: If the average log probability over sampled tokens is
                 below this value, treat as failed.
             log_prob_low_threshold: This parameter alone is sufficient to skip an output text,
                 whereas log_prob_threshold also looks for appropriate no_speech_threshold value.
                 This value should be less than log_prob_threshold.
             no_speech_threshold: If the no_speech probability is higher than this value AND
                 the average log probability over sampled tokens is below `log_prob_threshold`,
                 consider the segment as silent.
@@ -353,16 +352,21 @@ def transcribe(
             condition_on_previous_text: If True, the previous output of the model is provided
                 as a prompt for the next window; disabling may make the text inconsistent across
                 windows, but the model becomes less prone to getting stuck in a failure loop,
                 such as repetition looping or timestamps going out of sync. Set as False
             prompt_reset_on_temperature: Resets prompt if temperature is above this value.
                 Arg has effect only if condition_on_previous_text is True. Set at 0.5
-
+            prefix: Optional text to provide as a prefix at the beginning of each window.
+            max_initial_timestamp: The initial timestamp cannot be later than this, set at 0.0.
+            multilingual: If True, perform transcription on multilingual videos. Set as False.
+            output_language: Valid only if multilingual is set to True.
+                Specifies the string representing the output language. One of
+                'en' (English) or 'hybrid' (code-switched transcription). set as None.
+            hallucination_silence_threshold: Optional[float]
+                When word_timestamps is True, skip silent periods longer than this threshold
+                (in seconds) when a possible hallucination is detected. set as None.
Returns: A tuple with: From cce3c675ed582ef9432c51772bfa40d3b02aa55b Mon Sep 17 00:00:00 2001 From: MahmoudAshraf97 Date: Sat, 16 Nov 2024 13:01:45 +0200 Subject: [PATCH 5/8] reduce diff [skip ci] --- faster_whisper/transcribe.py | 65 +++++++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 8 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 4923d80b..6ddf6e5f 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -243,11 +243,45 @@ def forward(self, features, tokenizer, chunks_metadata, options): return segmented_outputs + def get_language_and_tokenizer( + self, audio, task: Optional[str] = None, language: Optional[str] = None + ): + all_language_probs = None + language_probability = 1.0 + + if self.tokenizer is None: + if not language: + ( + language, + language_probability, + all_language_probs, + ) = self.model.detect_language(audio) + task = task or "transcribe" + self.tokenizer = Tokenizer( + self.model.hf_tokenizer, + self.model.model.is_multilingual, + task=task, + language=language, + ) + else: + if task is not None: + self.tokenizer.task = self.tokenizer.tokenizer.token_to_id( + f"<|{task}|>" + ) + + if language is not None: + self.tokenizer.language = self.tokenizer.tokenizer.token_to_id( + f"<|{language}|>" + ) + self.tokenizer.language_code = language + + return language, language_probability, task, all_language_probs + def transcribe( self, audio: Union[str, BinaryIO, np.ndarray], language: Optional[str] = None, - task: str = "transcribe", + task: str = None, log_progress: bool = False, beam_size: int = 5, best_of: int = 5, @@ -288,8 +322,6 @@ def transcribe( hallucination_silence_threshold: Optional[float] = None, batch_size: int = 8, hotwords: Optional[str] = None, - language_detection_threshold: Optional[float] = 0.5, - language_detection_segments: int = 1, ) -> Tuple[Iterable[Segment], TranscriptionInfo]: """transcribe audio in chunks in batched fashion and return with language info. @@ -337,9 +369,6 @@ def transcribe( batch_size: the maximum number of parallel requests to model for decoding. hotwords: Hotwords/hint phrases to the model. Has no effect if prefix is not None. - language_detection_threshold: If the maximum probability of the language tokens is - higher than this value, the language is detected. - language_detection_segments: Number of segments to consider for the language detection. 
Unused Arguments compression_ratio_threshold: If the gzip compression ratio is above this value, @@ -488,7 +517,7 @@ def transcribe( initial_prompt=initial_prompt, prefix=prefix, suppress_blank=suppress_blank, - suppress_tokens=get_suppressed_tokens(tokenizer, suppress_tokens), + suppress_tokens=get_suppressed_tokens(self.tokenizer, suppress_tokens), prepend_punctuations=prepend_punctuations, append_punctuations=append_punctuations, max_new_tokens=max_new_tokens, @@ -513,10 +542,27 @@ def transcribe( vad_options=vad_parameters, all_language_probs=all_language_probs, ) + + audio_chunks, chunks_metadata = collect_chunks(audio, clip_timestamps) + features = ( + np.stack( + [ + pad_or_trim( + self.model.feature_extractor(chunk)[ + ..., + : chunk.shape[0] // self.model.feature_extractor.hop_length, + ] + ) + for chunk in audio_chunks + ] + ) + if duration_after_vad + else [] + ) segments = self._batched_segments_generator( features, - tokenizer, + self.tokenizer, chunks_metadata, batch_size, options, @@ -562,6 +608,9 @@ def _batched_segments_generator( pbar.update(1) pbar.close() + # revert the tokenizer if multilingual inference is enabled + if self.preset_language is None: + self.tokenizer = None self.last_speech_timestamp = 0.0 From aededdf5ccf0b5766580ad6665abd908b53792d2 Mon Sep 17 00:00:00 2001 From: MahmoudAshraf97 Date: Sat, 16 Nov 2024 13:03:25 +0200 Subject: [PATCH 6/8] . [skip ci] --- faster_whisper/transcribe.py | 57 +++++++++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 6ddf6e5f..7325a6c3 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -119,6 +119,58 @@ def __init__( self.model: WhisperModel = model self.last_speech_timestamp = 0.0 + def forward(self, features, tokenizer, chunks_metadata, options): + encoder_output, outputs = self.generate_segment_batched( + features, tokenizer, options + ) + + segmented_outputs = [] + segment_sizes = [] + for chunk_metadata, output in zip(chunks_metadata, outputs): + duration = chunk_metadata["end_time"] - chunk_metadata["start_time"] + segment_size = int(ceil(duration) * self.model.frames_per_second) + segment_sizes.append(segment_size) + ( + subsegments, + seek, + single_timestamp_ending, + ) = self.model._split_segments_by_timestamps( + tokenizer=tokenizer, + tokens=output["tokens"], + time_offset=chunk_metadata["start_time"], + segment_size=segment_size, + segment_duration=duration, + seek=0, + ) + segmented_outputs.append( + [ + dict( + text=tokenizer.decode(subsegment["tokens"]), + avg_logprob=output["avg_logprob"], + no_speech_prob=output["no_speech_prob"], + tokens=subsegment["tokens"], + start=subsegment["start"], + end=subsegment["end"], + compression_ratio=get_compression_ratio( + tokenizer.decode(subsegment["tokens"]) + ), + ) + for subsegment in subsegments + ] + ) + if options.word_timestamps: + self.last_speech_timestamp = self.model.add_word_timestamps( + segmented_outputs, + tokenizer, + encoder_output, + segment_sizes, + options.prepend_punctuations, + options.append_punctuations, + self.last_speech_timestamp, + ) + + return segmented_outputs + def generate_segment_batched( self, features: np.ndarray, @@ -223,9 +275,6 @@ def forward(self, features, tokenizer, chunks_metadata, options): compression_ratio=get_compression_ratio( tokenizer.decode(subsegment["tokens"]) ), - seek=int( - chunk_metadata["start_time"] * self.model.frames_per_second - ), ) for subsegment in subsegments ] @@ -542,7 
+591,7 @@ def transcribe( vad_options=vad_parameters, all_language_probs=all_language_probs, ) - + audio_chunks, chunks_metadata = collect_chunks(audio, clip_timestamps) features = ( np.stack( From 9ae92c049f2f2670726c200f66321e0a36fd1a57 Mon Sep 17 00:00:00 2001 From: MahmoudAshraf97 Date: Sat, 16 Nov 2024 13:23:36 +0200 Subject: [PATCH 7/8] fix rebase [skip ci] --- faster_whisper/transcribe.py | 113 ++--------------------------------- 1 file changed, 6 insertions(+), 107 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 7325a6c3..01c1e67e 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -240,97 +240,11 @@ def generate_segment_batched( return encoder_output, output - def forward(self, features, tokenizer, chunks_metadata, options): - encoder_output, outputs = self.generate_segment_batched( - features, tokenizer, options - ) - - segmented_outputs = [] - segment_sizes = [] - for chunk_metadata, output in zip(chunks_metadata, outputs): - duration = chunk_metadata["end_time"] - chunk_metadata["start_time"] - segment_size = int(ceil(duration) * self.model.frames_per_second) - segment_sizes.append(segment_size) - ( - subsegments, - seek, - single_timestamp_ending, - ) = self.model._split_segments_by_timestamps( - tokenizer=tokenizer, - tokens=output["tokens"], - time_offset=chunk_metadata["start_time"], - segment_size=segment_size, - segment_duration=duration, - seek=0, - ) - segmented_outputs.append( - [ - dict( - text=tokenizer.decode(subsegment["tokens"]), - avg_logprob=output["avg_logprob"], - no_speech_prob=output["no_speech_prob"], - tokens=subsegment["tokens"], - start=subsegment["start"], - end=subsegment["end"], - compression_ratio=get_compression_ratio( - tokenizer.decode(subsegment["tokens"]) - ), - ) - for subsegment in subsegments - ] - ) - if options.word_timestamps: - self.last_speech_timestamp = self.model.add_word_timestamps( - segmented_outputs, - tokenizer, - encoder_output, - segment_sizes, - options.prepend_punctuations, - options.append_punctuations, - self.last_speech_timestamp, - ) - - return segmented_outputs - - def get_language_and_tokenizer( - self, audio, task: Optional[str] = None, language: Optional[str] = None - ): - all_language_probs = None - language_probability = 1.0 - - if self.tokenizer is None: - if not language: - ( - language, - language_probability, - all_language_probs, - ) = self.model.detect_language(audio) - task = task or "transcribe" - self.tokenizer = Tokenizer( - self.model.hf_tokenizer, - self.model.model.is_multilingual, - task=task, - language=language, - ) - else: - if task is not None: - self.tokenizer.task = self.tokenizer.tokenizer.token_to_id( - f"<|{task}|>" - ) - - if language is not None: - self.tokenizer.language = self.tokenizer.tokenizer.token_to_id( - f"<|{language}|>" - ) - self.tokenizer.language_code = language - - return language, language_probability, task, all_language_probs - def transcribe( self, audio: Union[str, BinaryIO, np.ndarray], language: Optional[str] = None, - task: str = None, + task: str = "transcribe", log_progress: bool = False, beam_size: int = 5, best_of: int = 5, @@ -371,6 +285,8 @@ def transcribe( hallucination_silence_threshold: Optional[float] = None, batch_size: int = 8, hotwords: Optional[str] = None, + language_detection_threshold: Optional[float] = 0.5, + language_detection_segments: int = 1, ) -> Tuple[Iterable[Segment], TranscriptionInfo]: """transcribe audio in chunks in batched fashion and return with language info. 
@@ -418,6 +334,9 @@ def transcribe( batch_size: the maximum number of parallel requests to model for decoding. hotwords: Hotwords/hint phrases to the model. Has no effect if prefix is not None. + language_detection_threshold: If the maximum probability of the language tokens is + higher than this value, the language is detected. + language_detection_segments: Number of segments to consider for the language detection. Unused Arguments compression_ratio_threshold: If the gzip compression ratio is above this value, @@ -592,23 +511,6 @@ def transcribe( all_language_probs=all_language_probs, ) - audio_chunks, chunks_metadata = collect_chunks(audio, clip_timestamps) - features = ( - np.stack( - [ - pad_or_trim( - self.model.feature_extractor(chunk)[ - ..., - : chunk.shape[0] // self.model.feature_extractor.hop_length, - ] - ) - for chunk in audio_chunks - ] - ) - if duration_after_vad - else [] - ) - segments = self._batched_segments_generator( features, self.tokenizer, @@ -657,9 +559,6 @@ def _batched_segments_generator( pbar.update(1) pbar.close() - # revert the tokenizer if multilingual inference is enabled - if self.preset_language is None: - self.tokenizer = None self.last_speech_timestamp = 0.0 From 07fb5758f4dc7e0a11496672848e0411df000d25 Mon Sep 17 00:00:00 2001 From: MahmoudAshraf97 Date: Sat, 16 Nov 2024 13:28:57 +0200 Subject: [PATCH 8/8] fix rebase --- faster_whisper/transcribe.py | 63 ++++-------------------------------- 1 file changed, 6 insertions(+), 57 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 01c1e67e..bf091981 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -154,6 +154,9 @@ def forward(self, features, tokenizer, chunks_metadata, options): compression_ratio=get_compression_ratio( tokenizer.decode(subsegment["tokens"]) ), + seek=int( + chunk_metadata["start_time"] * self.model.frames_per_second + ), ) for subsegment in subsegments ] @@ -455,7 +458,7 @@ def transcribe( language_probability = 1 - self.tokenizer = Tokenizer( + tokenizer = Tokenizer( self.model.hf_tokenizer, self.model.model.is_multilingual, task=task, @@ -485,7 +488,7 @@ def transcribe( initial_prompt=initial_prompt, prefix=prefix, suppress_blank=suppress_blank, - suppress_tokens=get_suppressed_tokens(self.tokenizer, suppress_tokens), + suppress_tokens=get_suppressed_tokens(tokenizer, suppress_tokens), prepend_punctuations=prepend_punctuations, append_punctuations=append_punctuations, max_new_tokens=max_new_tokens, @@ -513,7 +516,7 @@ def transcribe( segments = self._batched_segments_generator( features, - self.tokenizer, + tokenizer, chunks_metadata, batch_size, options, @@ -1738,60 +1741,6 @@ def find_alignment( ) return return_list - def generate_segment_batched( - self, - features: np.ndarray, - tokenizer: Tokenizer, - options: TranscriptionOptions, - ): - batch_size = features.shape[0] - - prompt = self.get_prompt( - tokenizer, - previous_tokens=( - tokenizer.encode(options.initial_prompt) - if options.initial_prompt is not None - else [] - ), - without_timestamps=options.without_timestamps, - prefix=options.prefix, - hotwords=options.hotwords, - ) - - encoder_output = self.encode(features) - - results = self.model.generate( - encoder_output, - [prompt] * batch_size, - beam_size=options.beam_size, - patience=options.patience, - length_penalty=options.length_penalty, - max_length=self.max_length, - suppress_blank=options.suppress_blank, - suppress_tokens=options.suppress_tokens, - return_scores=True, - return_no_speech_prob=True, 
- sampling_temperature=options.temperatures[0], - repetition_penalty=options.repetition_penalty, - no_repeat_ngram_size=options.no_repeat_ngram_size, - ) - - output = [] - for result in results: - # return scores - seq_len = len(result.sequences_ids[0]) - cum_logprob = result.scores[0] * (seq_len**options.length_penalty) - - output.append( - dict( - avg_logprob=cum_logprob / (seq_len + 1), - no_speech_prob=result.no_speech_prob, - tokens=result.sequences_ids[0], - ) - ) - - return encoder_output, output - def detect_language( self, audio: Optional[np.ndarray] = None,
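With the series applied, `BatchedInferencePipeline` is constructed from a `WhisperModel` alone (patch 2 drops the tokenizer/options/language constructor arguments), and `transcribe()` resolves language detection, feature batching, and decoding itself. Below is a minimal usage sketch of the resulting API; the model size, device, compute type, and audio path are illustrative assumptions rather than part of the patches, and the top-level import follows the upstream README.

    from faster_whisper import BatchedInferencePipeline, WhisperModel

    # Illustrative model/device choices; any CTranslate2 Whisper model works.
    model = WhisperModel("large-v3", device="cuda", compute_type="float16")

    # After patch 2, the pipeline takes only the model; tokenizer and options
    # are created inside transcribe().
    pipeline = BatchedInferencePipeline(model)

    # batch_size defaults to 8 after patch 1; segments is a lazy generator.
    segments, info = pipeline.transcribe("audio.wav", batch_size=8)

    print(info.language, info.language_probability)
    for segment in segments:
        # Each Segment also carries the sampling temperature as of patch 3.
        print(f"[{segment.start:.2f} -> {segment.end:.2f}] {segment.text}")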