From f3cd20fbf3d665326a36e825acc0beeea379c26a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jes=C3=BAs=20Hormigo?= Date: Sat, 9 Dec 2023 21:24:55 +0100 Subject: [PATCH 1/3] Support for HLS transcription (resolves #62) --- whisper_live/client.py | 49 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/whisper_live/client.py b/whisper_live/client.py index ca2300c4..4ab30fa6 100644 --- a/whisper_live/client.py +++ b/whisper_live/client.py @@ -11,6 +11,7 @@ import websocket import uuid import time +import subprocess def resample(file: str, sr: int = 16000): @@ -344,6 +345,46 @@ def write_audio_frames_to_file(self, frames, file_name): wavfile.setframerate(self.rate) wavfile.writeframes(frames) + def process_hls_stream(self, hls_url): + """ + Connect to an HLS source, process the audio stream, and send it for transcription. + + Args: + hls_url (str): The URL of the HLS stream source. + """ + print("[INFO]: Connecting to HLS stream...") + process = None # Initialize process to None + + + try: + # Launch an FFMPEG process to connect to the HLS stream + command = [ + 'ffmpeg', + '-i', hls_url, # Input URL + '-acodec', 'pcm_s16le', # Output codec + '-f', 's16le', # Output format + '-ac', '1', # Set audio channels to 1 (mono) + '-ar', str(self.rate), # Resample audio to the specified rate + '-' + ] + process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + # Process the stream + while True: + in_bytes = process.stdout.read(self.chunk * 2) # 2 bytes per sample + if not in_bytes: + break + audio_array = self.bytes_to_float_array(in_bytes) + self.send_packet_to_server(audio_array.tobytes()) + + except Exception as e: + print(f"[ERROR]: Failed to connect to HLS stream: {e}") + finally: + if process: + process.kill() + + print("[INFO]: HLS stream processing finished.") + def record(self, out_file="output_recording.wav"): """ Record audio data from the input stream and save it to a WAV file. 
@@ -464,7 +505,7 @@ class TranscriptionClient: def __init__(self, host, port, is_multilingual=False, lang=None, translate=False): self.client = Client(host, port, is_multilingual, lang, translate) - def __call__(self, audio=None): + def __call__(self, audio=None, hls_url=None): """ Start the transcription process. @@ -483,8 +524,10 @@ def __call__(self, audio=None): return pass print("[INFO]: Server Ready!") - if audio is not None: + if hls_url is not None: + self.client.process_hls_stream(hls_url) + elif audio is not None: resampled_file = resample(audio) self.client.play_file(resampled_file) else: - self.client.record() + self.client.record() \ No newline at end of file From b6dee4e46e96dbbe091601d11dec3840ceb714d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jes=C3=BAs=20Hormigo?= Date: Sun, 10 Dec 2023 19:34:27 +0100 Subject: [PATCH 2/3] Using ffmpeg-python package instead of requiring having ffmpeg installed in system --- README.md | 6 ++++++ whisper_live/client.py | 21 ++++++++------------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index f1e15428..3977c476 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,12 @@ Unlike traditional speech recognition systems that rely on continuous audio stre ``` This command captures audio from the microphone and sends it to the server for transcription. It uses the same options as the previous command, enabling the multilingual feature and specifying the target language and task. + - To transcribe from a HLS stream: + ```python + client = TranscriptionClient(host, port, is_multilingual=True, lang="en", translate=False) + client(hls_url="http://domain.url/playlist.m3u8") + ``` + This command streams audio into the server from a HLS stream. It uses the same options as the previous command, enabling the multilingual feature and specifying the target language and task. 
## Transcribe audio from browser - Run the server diff --git a/whisper_live/client.py b/whisper_live/client.py index 4ab30fa6..070845db 100644 --- a/whisper_live/client.py +++ b/whisper_live/client.py @@ -11,7 +11,6 @@ import websocket import uuid import time -import subprocess def resample(file: str, sr: int = 16000): @@ -355,19 +354,14 @@ def process_hls_stream(self, hls_url): print("[INFO]: Connecting to HLS stream...") process = None # Initialize process to None - try: - # Launch an FFMPEG process to connect to the HLS stream - command = [ - 'ffmpeg', - '-i', hls_url, # Input URL - '-acodec', 'pcm_s16le', # Output codec - '-f', 's16le', # Output format - '-ac', '1', # Set audio channels to 1 (mono) - '-ar', str(self.rate), # Resample audio to the specified rate - '-' - ] - process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + # Connecting to the HLS stream using ffmpeg-python + process = ( + ffmpeg + .input(hls_url, threads=0) + .output('-', format='s16le', acodec='pcm_s16le', ac=1, ar=self.rate) + .run_async(pipe_stdout=True, pipe_stderr=True) + ) # Process the stream while True: @@ -385,6 +379,7 @@ def process_hls_stream(self, hls_url): print("[INFO]: HLS stream processing finished.") + def record(self, out_file="output_recording.wav"): """ Record audio data from the input stream and save it to a WAV file. 
From a1a8d5f92a2dd769a4e4bf4fd2d2dd7f720a162f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jes=C3=BAs=20Hormigo?= Date: Tue, 12 Dec 2023 13:46:52 +0100 Subject: [PATCH 3/3] Added a HLS stream sample URL --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3977c476..69e20e3b 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ Unlike traditional speech recognition systems that rely on continuous audio stre - To transcribe from a HLS stream: ```python client = TranscriptionClient(host, port, is_multilingual=True, lang="en", translate=False) - client(hls_url="http://domain.url/playlist.m3u8") + client(hls_url="http://as-hls-ww-live.akamaized.net/pool_904/live/ww/bbc_1xtra/bbc_1xtra.isml/bbc_1xtra-audio%3d96000.norewind.m3u8") ``` This command streams audio into the server from a HLS stream. It uses the same options as the previous command, enabling the multilingual feature and specifying the target language and task.