From 60845d61a9c99a9e1d6424bc3f05d80ee5443213 Mon Sep 17 00:00:00 2001 From: Chafid Ahmad Date: Sun, 22 Sep 2024 17:29:29 +0700 Subject: [PATCH 1/2] initial commit --- .gitignore | 1 + examples/test-google-stream.py | 54 ++++++++++++++++------------ examples/test-sherpaonnx.py | 2 +- tts_wrapper/engines/google/google.py | 8 ++++- tts_wrapper/tts.py | 6 ++++ 5 files changed, 47 insertions(+), 24 deletions(-) diff --git a/.gitignore b/.gitignore index 0deceeb8..9f8f58ec 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,4 @@ cov.xml credentials-private.json examples/*.wav examples/*.mp3 +examples/ttsandtranslate-7dd2e2d80d42.json diff --git a/examples/test-google-stream.py b/examples/test-google-stream.py index e6d3da0e..6a9eb3fe 100644 --- a/examples/test-google-stream.py +++ b/examples/test-google-stream.py @@ -40,31 +40,41 @@ def main(): ) logging.info(f"Text to synthesize: {text}") - # Test synth_to_bytestream method - output_file_bytestream = "output_streamed_google.wav" # Change to 'mp3' or 'flac' as needed - audio_format = "wav" # Supported formats: 'wav', 'mp3', 'flac' - - if audio_format.lower() == 'wav': - # Initialize WAV file - with wave.open(output_file_bytestream, 'wb') as wf: - wf.setnchannels(1) # Mono - wf.setsampwidth(2) # 16-bit PCM - wf.setframerate(tts.audio_rate) - logging.info(f"Starting synthesis and streaming to {output_file_bytestream} in {audio_format} format.") - - for chunk_idx, audio_chunk in enumerate(tts.synth_to_bytestream(text, format=audio_format)): - logging.info(f"Received audio chunk {chunk_idx} with size {len(audio_chunk)} bytes") - wf.writeframes(audio_chunk) # Write PCM frames to WAV file - - logging.info(f"Audio successfully saved to {output_file_bytestream} in {audio_format} format via synth_to_bytestream.") - - else: - # Handle non-WAV formats if implemented - pass - + ## Test synth_to_bytestream method + #output_file_bytestream = "output_streamed_google.wav" # Change to 'mp3' or 'flac' as needed + #audio_format = "wav" # Supported formats: 'wav', 'mp3', 'flac' + # + #if audio_format.lower() == 'wav': + # # Initialize WAV file + # with wave.open(output_file_bytestream, 'wb') as wf: + # wf.setnchannels(1) # Mono + # wf.setsampwidth(2) # 16-bit PCM + # wf.setframerate(tts.audio_rate) + # logging.info(f"Starting synthesis and streaming to {output_file_bytestream} in {audio_format} format.") + # + # for chunk_idx, audio_chunk in enumerate(tts.synth_to_bytestream(text, format=audio_format)): + # logging.info(f"Received audio chunk {chunk_idx} with size {len(audio_chunk)} bytes") + # wf.writeframes(audio_chunk) # Write PCM frames to WAV file + # + # logging.info(f"Audio successfully saved to {output_file_bytestream} in {audio_format} format via synth_to_bytestream.") + # + #else: + # # Handle non-WAV formats if implemented + # pass + # # Test speak_streamed method output_file_speak_streamed = "output_speak_streamed_google.wav" tts.speak_streamed(text) + # Pause playback after 5 seconds + # time.sleep(2) + tts.pause_playback() + print("Playback paused.") + + # Resume playback after 3 seconds + time.sleep(3) + tts.resume_playback() + print("Playback resumed.") + logging.info(f"Audio successfully saved to {output_file_speak_streamed} in wav format via speak_streamed.") except Exception as e: diff --git a/examples/test-sherpaonnx.py b/examples/test-sherpaonnx.py index 3182cbd9..32d4e187 100755 --- a/examples/test-sherpaonnx.py +++ b/examples/test-sherpaonnx.py @@ -47,7 +47,7 @@ def main(): f.write(audio_chunk) # Write the chunk to the file 
logging.info(f"Audio successfully saved to {output_file} in {audio_format} format.") - + tts.speak_streamed(text) except Exception as e: logging.error(f"An error occurred during synthesis: {e}") diff --git a/tts_wrapper/engines/google/google.py b/tts_wrapper/engines/google/google.py index d93c3a88..9d17a07f 100644 --- a/tts_wrapper/engines/google/google.py +++ b/tts_wrapper/engines/google/google.py @@ -35,6 +35,7 @@ def __init__( self.audio_started = False self.audio_stopped = False self.audio_killed = False + self.is_paused = False # Audio playback callback, called continuously to stream audio from the buffer def play_audio_callback( @@ -273,7 +274,6 @@ def speak_streamed( logging.info( f"Processing audio chunk {chunk_idx} with size {len(audio_chunk)} bytes" ) - if audio_format.lower() == "wav": # Convert bytes back to numpy float32 array for playback # Assuming audio_chunk is raw PCM data (LINEAR16) @@ -325,6 +325,12 @@ def speak_streamed( f"Audio successfully saved to {save_to_file_path} in {audio_format} format." ) + def pause_playback(self): + is_paused = True + + def resume_playback(self): + is_paused = False + def play_audio(self): """ Plays audio from the audio_buffer using sounddevice. diff --git a/tts_wrapper/tts.py b/tts_wrapper/tts.py index f56c1412..f132356e 100644 --- a/tts_wrapper/tts.py +++ b/tts_wrapper/tts.py @@ -446,3 +446,9 @@ def _convert_to_ssml(self, text: str) -> str: ssml_parts.append(f'{word}') ssml_parts.append("") return " ".join(ssml_parts) + + def pause_playback(self): + pass + + def resume_playback(self): + pass \ No newline at end of file From 0cdc177bcb30c8ddc0b435860ac6233834612ebd Mon Sep 17 00:00:00 2001 From: Chafid Ahmad Date: Tue, 24 Sep 2024 17:09:31 +0700 Subject: [PATCH 2/2] adding speak_streamed and related functions too all engines --- examples/test-eleven.py | 10 +- examples/test-google.py | 2 +- examples/test-googleTrans.py | 2 +- examples/test-mms.py | 2 +- examples/test-pico.py | 53 ++++ examples/test-polly.py | 55 +++-- examples/test-sapi.py | 64 +++++ examples/test-uwp.py | 54 ++++ examples/test-watson.py | 79 +++--- examples/test-witai.py | 11 +- tts_wrapper/engines/__init__.py | 10 +- tts_wrapper/engines/elevenlabs/elevenlabs.py | 227 ++++++++++++++++- tts_wrapper/engines/google/google.py | 6 - .../engines/googletrans/googletrans.py | 229 ++++++++++++++++- tts_wrapper/engines/microsoft/microsoft.py | 226 ++++++++++++++++- tts_wrapper/engines/mms/mms.py | 224 ++++++++++++++++- tts_wrapper/engines/pico/pico.py | 224 ++++++++++++++++- tts_wrapper/engines/polly/polly.py | 230 +++++++++++++++++- tts_wrapper/engines/sapi/sapi.py | 229 ++++++++++++++++- tts_wrapper/engines/watson/watson.py | 229 ++++++++++++++++- tts_wrapper/engines/witai/witai.py | 229 ++++++++++++++++- tts_wrapper/tts.py | 1 + 22 files changed, 2297 insertions(+), 99 deletions(-) create mode 100644 examples/test-pico.py create mode 100644 examples/test-sapi.py create mode 100644 examples/test-uwp.py diff --git a/examples/test-eleven.py b/examples/test-eleven.py index 5c701347..962dc342 100644 --- a/examples/test-eleven.py +++ b/examples/test-eleven.py @@ -8,12 +8,14 @@ client = ElevenLabsClient(credentials=(os.getenv('ELEVENLABS_API_KEY'))) tts = ElevenLabsTTS(client) -print(client.get_voices()) +#print(client.get_voices()) # # # pausing try: ssml_text = tts.ssml.add( - "This is me speaking with Speak function and ElevenLabs" + "This is me speaking with Speak function and ElevenLabs. 
I should be hearing a sentence" ) + print ("SSML TEXT") + print(ssml_text) tts.speak_streamed(ssml_text) # Pause after 5 seconds time.sleep(0.3) @@ -21,11 +23,11 @@ print("Pausing..") # Resume after 3 seconds time.sleep(0.5) - tts.resume_audio() + #tts.resume_audio() print("Resuming") # Stop after 2 seconds time.sleep(1) - tts.stop_audio() + #tts.stop_audio() print("Stopping.") except Exception as e: print(f"Error at pausing: {e}") diff --git a/examples/test-google.py b/examples/test-google.py index 41c43012..e5d2f4b1 100644 --- a/examples/test-google.py +++ b/examples/test-google.py @@ -25,7 +25,7 @@ # print("Resuming") # # Stop after 2 seconds # time.sleep(1) -# tts.stop_audio() + tts.stop_audio() # print("Stopping.") except Exception as e: print(f"Error at pausing: {e}") diff --git a/examples/test-googleTrans.py b/examples/test-googleTrans.py index b5a5742f..14b28620 100644 --- a/examples/test-googleTrans.py +++ b/examples/test-googleTrans.py @@ -17,7 +17,7 @@ # Define the text to be synthesized text = "Hello, This is a word timing test" start_time = time.time() - tts.speak(text) + tts.speak_streamed(text) synthesis_time = time.time() print(f"Synthesis time: {synthesis_time - start_time:.3f} seconds") text = "Hello, This is a word timing test" diff --git a/examples/test-mms.py b/examples/test-mms.py index b86fec3d..0975ca58 100644 --- a/examples/test-mms.py +++ b/examples/test-mms.py @@ -44,7 +44,7 @@ ssml_text = tts.ssml.add(text_with_prosody) print("ssml_text", ssml_text) - tts.speak(ssml_text) + tts.speak_streamed(ssml_text) time.sleep(0.5) print("save to file") diff --git a/examples/test-pico.py b/examples/test-pico.py new file mode 100644 index 00000000..08404617 --- /dev/null +++ b/examples/test-pico.py @@ -0,0 +1,53 @@ +from tts_wrapper import PicoTTS, PicoClient +import json +import time +from pathlib import Path +import os + +# Initialize the client with only the lang parameter +client = PicoClient() +tts = PicoTTS(client) +text = "hello world i like monkeys" +tts.speak_streamed(text) + +print(text) + +# volume control test +print("Volume setting is from 0-100") +text_read = "" +try: + tts.set_property("volume", "50") + print("Setting volume at 50") + text_read = f"The current volume is at fifty" + text_with_prosody = tts.construct_prosody_tag(text_read) + ssml_text = tts.ssml.add(text_with_prosody) + print("ssml_text", ssml_text) + tts.speak(ssml_text) + time.sleep(0.5) + + #clear ssml so the previous text is not repeated + + tts.set_property("volume", "100") + print("Setting volume at 100") + text_read = f"The current volume is at a hundred" + text_with_prosody = tts.construct_prosody_tag(text_read) + ssml_text = tts.ssml.add(text_with_prosody) + print("ssml_text", ssml_text) + + tts.speak(ssml_text) + time.sleep(0.5) + + tts.set_property("volume", "10") + print("Setting volume at 10") + text_read = f"The current volume is at ten" + text_with_prosody = tts.construct_prosody_tag(text_read) + ssml_text = tts.ssml.add(text_with_prosody) + print("ssml_text", ssml_text) + + tts.speak(ssml_text) + time.sleep(0.5) + + print("save to file") + tts.synth_to_file(ssml_text, "pico_output.wav", "wav") +except Exception as e: + print(f"Error at setting volume: {e}") \ No newline at end of file diff --git a/examples/test-polly.py b/examples/test-polly.py index f9ea227d..283ab3bb 100644 --- a/examples/test-polly.py +++ b/examples/test-polly.py @@ -87,7 +87,8 @@ def on_end(): tts.connect('onStart', on_start) tts.connect('onEnd', on_end) print(tts) - tts.start_playback_with_callbacks(text, 
callback=my_callback) +# tts.start_playback_with_callbacks(text, callback=my_callback) + tts.speak_streamed(text) print("save to file") tts.synth_to_file(text, "polly_output.wav", "wav") except Exception as e: @@ -96,33 +97,33 @@ def on_end(): # volume control test # print("Volume setting is from 0-100") # text_read = "" -# try: -# tts.set_property("volume", "50") -# print("Setting volume at 50") -# text_read = f"The current volume is at 50" -# text_with_prosody = tts.construct_prosody_tag(text_read) -# ssml_text = tts.ssml.add(text_with_prosody) -# tts.speak_streamed(ssml_text) -# time.sleep(5) +try: + tts.set_property("volume", "50") + print("Setting volume at 50") + text_read = f"The current volume is at 50" + text_with_prosody = tts.construct_prosody_tag(text_read) + ssml_text = tts.ssml.add(text_with_prosody) + tts.speak_streamed(ssml_text) + time.sleep(5) # # #clear ssml so the previous text is not repeated -# tts.ssml.clear_ssml() -# tts.set_property("volume", "100") -# print("Setting volume at 100") -# text_read = f"The current volume is at 100" -# text_with_prosody = tts.construct_prosody_tag(text_read) -# ssml_text = tts.ssml.add(text_with_prosody) -# tts.speak_streamed(ssml_text) -# time.sleep(5) + tts.ssml.clear_ssml() + tts.set_property("volume", "100") + print("Setting volume at 100") + text_read = f"The current volume is at 100" + text_with_prosody = tts.construct_prosody_tag(text_read) + ssml_text = tts.ssml.add(text_with_prosody) + tts.speak_streamed(ssml_text) + time.sleep(5) # -# tts.ssml.clear_ssml() -# tts.set_property("volume", "10") -# print("Setting volume at 10") -# text_read = f"The current volume is at 10" -# text_with_prosody = tts.construct_prosody_tag(text_read) -# ssml_text = tts.ssml.add(text_with_prosody) -# tts.speak_streamed(ssml_text) -# time.sleep(5) + tts.ssml.clear_ssml() + tts.set_property("volume", "10") + print("Setting volume at 10") + text_read = f"The current volume is at 10" + text_with_prosody = tts.construct_prosody_tag(text_read) + ssml_text = tts.ssml.add(text_with_prosody) + tts.speak_streamed(ssml_text) + time.sleep(5) # -# except Exception as e: -# print(f"Error at setting volume: {e}") +except Exception as e: + print(f"Error at setting volume: {e}") diff --git a/examples/test-sapi.py b/examples/test-sapi.py new file mode 100644 index 00000000..6730ffaa --- /dev/null +++ b/examples/test-sapi.py @@ -0,0 +1,64 @@ +from tts_wrapper import SAPITTS, SAPIClient, SAPISSML +import json +import time +from pathlib import Path +import os + +# Initialize the client with only the lang parameter +client = SAPIClient() +tts = SAPITTS(client) +text = "hello world i like monkeys" +tts.speak_streamed(text) + +print(text) + +# volume control test +print("Volume setting is from 0-100") +text_read = "" +try: + tts.set_property("volume", "50") + print("Setting volume at 50") + text_read = f"The current volume is at fifty" + text_with_prosody = tts.construct_prosody_tag(text_read) + ssml_text = tts.ssml.add(text_with_prosody) + print("ssml_text", ssml_text) + tts.speak_streamed(ssml_text) + time.sleep(0.5) + + #clear ssml so the previous text is not repeated + + tts.set_property("volume", "100") + print("Setting volume at 100") + text_read = f"The current volume is at a hundred" + text_with_prosody = tts.construct_prosody_tag(text_read) + ssml_text = tts.ssml.add(text_with_prosody) + print("ssml_text", ssml_text) + + tts.speak_streamed(ssml_text) + time.sleep(0.5) + + tts.set_property("volume", "10") + print("Setting volume at 10") + text_read = f"The current 
volume is at ten" + text_with_prosody = tts.construct_prosody_tag(text_read) + ssml_text = tts.ssml.add(text_with_prosody) + print("ssml_text", ssml_text) + + tts.speak_streamed(ssml_text) + time.sleep(0.5) + + print("save to file") + tts.synth_to_file(ssml_text, "mms_output.wav", "wav") +except Exception as e: + print(f"Error at setting volume: {e}") + +# # Demonstrate saving audio to a file +try: + ssml_text = tts.ssml.add(f"This is me speaking with for save to file function and SAPI text to speech") + output_file = Path(f"output_sapi.mp3") + tts.synth_to_file(ssml_text, str(output_file), format='mp3') +# # or you could do + #tts.speak(ssml_text) + print(f"Audio content saved to {output_file}") +except Exception as e: + print(f"Error at saving: {e}") \ No newline at end of file diff --git a/examples/test-uwp.py b/examples/test-uwp.py new file mode 100644 index 00000000..0c474154 --- /dev/null +++ b/examples/test-uwp.py @@ -0,0 +1,54 @@ +from tts_wrapper import UWPTTS, UWPClient +import json +import time +from pathlib import Path +import os + +# Initialize the client with only the lang parameter +client = UWPClient() +tts = UWPTTS(client) +text = "hello world i like monkeys" +#print(tts.get_voices()) +tts.speak_streamed(text) + +print(text) + +# volume control test +#print("Volume setting is from 0-100") +#text_read = "" +#try: +# tts.set_property("volume", "50") +# print("Setting volume at 50") +# text_read = f"The current volume is at fifty" +# text_with_prosody = tts.construct_prosody_tag(text_read) +# ssml_text = tts.ssml.add(text_with_prosody) +# print("ssml_text", ssml_text) +# tts.speak(ssml_text) +# time.sleep(0.5) + + #clear ssml so the previous text is not repeated + +# tts.set_property("volume", "100") +# print("Setting volume at 100") +# text_read = f"The current volume is at a hundred" +# text_with_prosody = tts.construct_prosody_tag(text_read) +# ssml_text = tts.ssml.add(text_with_prosody) +# print("ssml_text", ssml_text) + +# tts.speak(ssml_text) +# time.sleep(0.5) + +# tts.set_property("volume", "10") +# print("Setting volume at 10") +# text_read = f"The current volume is at ten" +# text_with_prosody = tts.construct_prosody_tag(text_read) +# ssml_text = tts.ssml.add(text_with_prosody) +# print("ssml_text", ssml_text) + +# tts.speak(ssml_text) +# time.sleep(0.5) + +# print("save to file") +# tts.synth_to_file(ssml_text, "mms_output.wav", "wav") +#except Exception as e: +# print(f"Error at setting volume: {e}") \ No newline at end of file diff --git a/examples/test-watson.py b/examples/test-watson.py index 8422abf1..f9767284 100644 --- a/examples/test-watson.py +++ b/examples/test-watson.py @@ -13,26 +13,27 @@ client = WatsonClient(credentials=(api_key, region, instance_id)) tts = WatsonTTS(client=client) -# print(client.get_voices()) +print(client.get_voices()) # # # pausing -# try: -# ssml_text = tts.ssml.add(f"This is me speaking with Speak function and ElevenLabs") -# tts.speak_streamed(ssml_text) -# # Pause after 5 seconds -# time.sleep(0.3) -# tts.pause_audio() -# print("Pausing..") -# # Resume after 3 seconds -# time.sleep(0.5) -# tts.resume_audio() -# print("Resuming") -# # Stop after 2 seconds -# time.sleep(1) -# tts.stop_audio() -# print("Stopping.") -# except Exception as e: -# print(f"Error at pausing: {e}") -# +try: + ssml_text = tts.ssml.add(f"This is me speaking with Speak function and ElevenLabs") + tts.speak_streamed(ssml_text) + print (ssml_text) + # Pause after 5 seconds + #time.sleep(0.3) + #tts.pause_audio() + #print("Pausing..") + # Resume after 3 
seconds + #time.sleep(0.5) + #tts.resume_audio() + print("Resuming") + # Stop after 2 seconds + #time.sleep(1) + #tts.stop_audio() + print("Stopping.") +except Exception as e: + print(f"Error at pausing: {e}") + # time.sleep(3) # # # Demonstrate saving audio to a file @@ -79,26 +80,26 @@ # ## calbacks -def my_callback(word: str, start_time: float, end_time: float): - duration = end_time - start_time - print(f"Word: {word}, Duration: {duration:.3f}s") - -def on_start(): - print('Speech started') - -def on_end(): - print('Speech ended') - -try: - text = "Hello, This is a word timing test" - tts.connect('onStart', on_start) - tts.connect('onEnd', on_end) - tts.start_playback_with_callbacks(text, callback=my_callback) -except Exception as e: - print(f"Error at callbacks: {e}") - -time.sleep(3) - +#def my_callback(word: str, start_time: float, end_time: float): +# duration = end_time - start_time +# print(f"Word: {word}, Duration: {duration:.3f}s") +# +#def on_start(): +# print('Speech started') +# +#def on_end(): +# print('Speech ended') +# +#try: +# text = "Hello, This is a word timing test" +# tts.connect('onStart', on_start) +# tts.connect('onEnd', on_end) +# tts.start_playback_with_callbacks(text, callback=my_callback) +#except Exception as e: +# print(f"Error at callbacks: {e}") +# +#time.sleep(3) +# try: text = "Test saving audio to file" print(text) diff --git a/examples/test-witai.py b/examples/test-witai.py index 460f00ee..64016b36 100644 --- a/examples/test-witai.py +++ b/examples/test-witai.py @@ -3,6 +3,7 @@ import os import os from load_credentials import load_credentials + # Load credentials load_credentials('credentials.json') @@ -44,10 +45,12 @@ def on_end(): print('Speech ended') try: - text = "Hello, This is a word timing test" - tts.connect('onStart', on_start) - tts.connect('onEnd', on_end) - tts.start_playback_with_callbacks(text, callback=my_callback) + #text = "Hello, This is a word timing test" + #tts.connect('onStart', on_start) + #tts.connect('onEnd', on_end) + #tts.start_playback_with_callbacks(text, callback=my_callback) + text = "This is a speak streamed function test using WITAI" + tts.speak_streamed(text) except Exception as e: print(f"Error at callbacks: {e}") diff --git a/tts_wrapper/engines/__init__.py b/tts_wrapper/engines/__init__.py index 1be424ae..66f3bbca 100644 --- a/tts_wrapper/engines/__init__.py +++ b/tts_wrapper/engines/__init__.py @@ -1,10 +1,10 @@ import sys -from .google import * -from .microsoft import * +from .google import * # +from .microsoft import * # from .pico import * -from .polly import * -from .sapi import * -from .watson import * +from .polly import * # +from .sapi import * # +from .watson import * from .elevenlabs import * from .uwp import * from .witai import * diff --git a/tts_wrapper/engines/elevenlabs/elevenlabs.py b/tts_wrapper/engines/elevenlabs/elevenlabs.py index f2b80f6b..1b7f225e 100644 --- a/tts_wrapper/engines/elevenlabs/elevenlabs.py +++ b/tts_wrapper/engines/elevenlabs/elevenlabs.py @@ -1,10 +1,16 @@ -from typing import Any, List, Dict, Optional, Tuple +from typing import Any, List, Dict, Optional, Tuple, Generator from ...exceptions import UnsupportedFileFormat from ...tts import AbstractTTS, FileFormat from . 
import ElevenLabsClient, ElevenLabsSSMLRoot import re import numpy as np import pathlib +import logging +import threading +import queue +import sounddevice as sd +import time +from io import BytesIO class ElevenLabsTTS(AbstractTTS): @@ -18,6 +24,12 @@ def __init__( self._client = client self.audio_rate = 22050 # Kept at 22050 self.set_voice(voice or "yoZ06aMxZJJ28mfd3POQ", lang or "en-US") + self.audio_buffer = queue.Queue() + self.playback_finished = threading.Event() + self.audio_started = False + self.audio_stopped = False + self.audio_killed = False + self.audio_format = "wav" # Default format def synth_to_bytes(self, text: Any) -> bytes: if not self._voice: @@ -49,6 +61,46 @@ def get_audio_duration(self) -> float: return num_samples / self.audio_rate return 0.0 + def _split_text(self, text: str) -> List[str]: + # Simple sentence splitter based on punctuation. + sentences = re.split(r"(?<=[.!?]) +", text) + return sentences + + # Audio playback callback, called continuously to stream audio from the buffer + def play_audio_callback( + self, outdata: np.ndarray, frames: int, time_info, status: sd.CallbackFlags + ): + if self.audio_killed or ( + self.audio_started and self.audio_buffer.empty() and self.audio_stopped + ): + logging.error("AUDIO KILLED OR STOPPED OR BUFFER EMPTY") + self.playback_finished.set() + return + + if self.audio_buffer.empty(): + outdata.fill(0) + return + + n = 0 + while n < frames and not self.audio_buffer.empty(): + remaining = frames - n + current_chunk = self.audio_buffer.queue[0] + k = current_chunk.shape[0] + + if remaining <= k: + outdata[n:, 0] = current_chunk[:remaining] + self.audio_buffer.queue[0] = current_chunk[remaining:] + n = frames + if self.audio_buffer.queue[0].shape[0] == 0: + self.audio_buffer.get() + break + + outdata[n : n + k, 0] = self.audio_buffer.get() + n += k + + if n < frames: + outdata[n:, 0] = 0 + def adjust_volume_value(self, generated_audio: bytes, volume: float) -> bytes: # check if generated audio length is odd. If it is, add an empty byte since np.frombuffer is expecting # an even length @@ -122,3 +174,176 @@ def set_voice(self, voice_id: str, lang_id: str = None): self._voice = voice_id # NB: Lang doesnt do much for ElevenLabs self._lang = lang_id + + + def synth_to_bytestream( + self, text: Any, format: Optional[str] = "wav" + ) -> Generator[bytes, None, None]: + """ + Synthesizes text to an in-memory bytestream in the specified audio format. + Yields audio data chunks as they are generated. + + :param text: The text to synthesize. + :param format: The desired audio format (e.g., 'wav', 'mp3', 'flac'). Defaults to 'wav'. + :return: A generator yielding bytes objects containing audio data. 
+ """ + try: + logging.info(f"[ElevenLabs.synth_to_bytestream] Synthesizing text: {text}") + + # Split the text into smaller segments (e.g., sentences) for incremental synthesis + + text_segments = self._split_text(str(text)) + + for segment_idx, segment in enumerate(text_segments): + logging.info(f"Synthesizing segment {segment_idx}: {segment}") + self.generated_audio, word_timings = self._client.synth(str(text), self._voice) + self.set_timings(word_timings) + + audio_bytes = self.generated_audio + + if format.lower() == "wav": + # Yield raw PCM data (skip WAV header if necessary) + # Google TTS returns LINEAR16 PCM in WAV format + audio_stream = BytesIO(audio_bytes) + audio_stream.seek(44) # Skip the 44-byte WAV header + chunk_size = 1024 # Number of bytes per chunk + + while True: + chunk = audio_stream.read(chunk_size) + if not chunk: + break + yield chunk + + elif format.lower() in ["mp3", "flac"]: + # Convert PCM to the desired format using _convert_audio + pcm_data = np.frombuffer(audio_bytes, dtype=np.int16) + converted_audio = self._convert_audio( + pcm_data, format, self.audio_rate + ) + chunk_size = 4096 # Number of bytes per chunk + audio_io = BytesIO(converted_audio) + + while True: + chunk = audio_io.read(chunk_size) + if not chunk: + break + yield chunk + + else: + raise UnsupportedFileFormat(f"Unsupported format: {format}") + + except Exception as e: + logging.error(f"Error in synth_to_bytestream: {e}") + raise + + def speak_streamed( + self, + text: str, + save_to_file_path: Optional[str] = None, + audio_format: Optional[str] = "wav", + ) -> None: + """ + Synthesizes text and plays it back using sounddevice in a streaming fashion. + Optionally saves the audio to a file after playback completes. + + :param text: The text to synthesize and play. + :param save_to_file_path: Path to save the audio file (optional). + :param audio_format: Audio format to save (e.g., 'wav', 'mp3', 'flac'). + """ + logging.info( + "[ElevenLabs.speak_streamed] Starting speech synthesis and playback..." + ) + + # Reset flags + self.audio_started = False + self.audio_stopped = False + self.playback_finished.clear() + + # Open the output file if saving is required + output_file = None + if save_to_file_path: + output_file = open(save_to_file_path, "wb") + logging.info( + f"Saving audio to {save_to_file_path} in {audio_format} format." + ) + + try: + # Start audio playback in a separate thread + playback_thread = threading.Thread(target=self.play_audio) + playback_thread.start() + + # Iterate over the generator returned by synth_to_bytestream + for chunk_idx, audio_chunk in enumerate( + self.synth_to_bytestream(text, format=audio_format) + ): + logging.info( + f"Processing audio chunk {chunk_idx} with size {len(audio_chunk)} bytes" + ) + if audio_format.lower() == "wav": + # Convert bytes back to numpy float32 array for playback + # Assuming audio_chunk is raw PCM data (LINEAR16) + samples = ( + np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) + / 32767.0 + ) + elif audio_format.lower() in ["mp3", "flac"]: + # For formats like MP3 or FLAC, you need to decode them back to PCM + # This requires additional processing which is not implemented here + # For simplicity, we'll skip playback for non-WAV formats + samples = None + logging.warning( + f"Playback for format '{audio_format}' is not implemented." 
+ ) + else: + raise UnsupportedFileFormat(f"Unsupported format: {audio_format}") + + if samples is not None: + # Add audio samples to the buffer for streaming playback + self.audio_buffer.put(samples) + logging.info(f"Audio chunk {chunk_idx} added to buffer") + + # Write the chunk to the file if saving + if save_to_file_path: + output_file.write( + audio_chunk + ) # Corrected from f.write to output_file.write + + if not self.audio_started and samples is not None: + logging.info("Starting audio playback...") + self.audio_started = True + + # Signal that audio generation is complete + self.audio_stopped = True + + # Wait for playback to finish + playback_thread.join() + logging.info("Playback finished.") + + except Exception as e: + logging.error(f"Error during speak_streamed: {e}") + self.audio_killed = True + + finally: + if output_file: + output_file.close() + logging.info( + f"Audio successfully saved to {save_to_file_path} in {audio_format} format." + ) + + def play_audio(self): + """ + Plays audio from the audio_buffer using sounddevice. + """ + try: + logging.info("Starting audio playback thread...") + with sd.OutputStream( + samplerate=self.audio_rate, + channels=1, + callback=self.play_audio_callback, + blocksize=4096, + dtype="float32", + ): + self.playback_finished.wait() + except Exception as e: + logging.error(f"Error during audio playback: {e}") + self.audio_killed = True diff --git a/tts_wrapper/engines/google/google.py index 9d17a07f..13dde23d 100644 --- a/tts_wrapper/engines/google/google.py +++ b/tts_wrapper/engines/google/google.py @@ -35,7 +35,6 @@ def __init__( self.audio_started = False self.audio_stopped = False self.audio_killed = False - self.is_paused = False # Audio playback callback, called continuously to stream audio from the buffer def play_audio_callback( @@ -325,11 +324,6 @@ def speak_streamed( f"Audio successfully saved to {save_to_file_path} in {audio_format} format." ) - def pause_playback(self): - self.is_paused = True - - def resume_playback(self): - self.is_paused = False - def play_audio(self): """ Plays audio from the audio_buffer using sounddevice. diff --git a/tts_wrapper/engines/googletrans/googletrans.py index 8aa70a6f..60658753 100644 --- a/tts_wrapper/engines/googletrans/googletrans.py +++ b/tts_wrapper/engines/googletrans/googletrans.py @@ -1,10 +1,19 @@ # engine.py -from typing import Any, List, Optional, Dict +from typing import Any, List, Optional, Dict, Literal, Tuple, Generator from ...exceptions import UnsupportedFileFormat from ...tts import AbstractTTS, FileFormat from .
import GoogleTransClient, GoogleTransSSML + + import re + import numpy as np + import pathlib import logging + import threading + import queue + import sounddevice as sd + import time + from io import BytesIO class GoogleTransTTS(AbstractTTS): @@ -12,6 +21,12 @@ def __init__(self, client: GoogleTransClient): super().__init__() self.client = client self.audio_rate = 24000 + self.audio_buffer = queue.Queue() + self.playback_finished = threading.Event() + self.audio_started = False + self.audio_stopped = False + self.audio_killed = False + self.audio_format = "wav" # Default format def get_voices(self): return self.client.get_voices() @@ -36,3 +51,215 @@ def set_voice(self, voice_id: str, lang_id: Optional[str] = None): def construct_prosody_tag(self, text: str) -> str: # Implement SSML prosody tag construction if needed return text + + def speak_streamed( + self, + text: str, + save_to_file_path: Optional[str] = None, + audio_format: Optional[str] = "wav", + ) -> None: + """ + Synthesizes text and plays it back using sounddevice in a streaming fashion. + Optionally saves the audio to a file after playback completes. + + :param text: The text to synthesize and play. + :param save_to_file_path: Path to save the audio file (optional). + :param audio_format: Audio format to save (e.g., 'wav', 'mp3', 'flac'). + """ + logging.info( + "[GoogleTransTTS.speak_streamed] Starting speech synthesis and playback..." + ) + + # Reset flags + self.audio_started = False + self.audio_stopped = False + self.playback_finished.clear() + + # Open the output file if saving is required + output_file = None + if save_to_file_path: + output_file = open(save_to_file_path, "wb") + logging.info( + f"Saving audio to {save_to_file_path} in {audio_format} format." + ) + + try: + # Start audio playback in a separate thread + playback_thread = threading.Thread(target=self.play_audio) + playback_thread.start() + + # Iterate over the generator returned by synth_to_bytestream + for chunk_idx, audio_chunk in enumerate( + self.synth_to_bytestream(str(text), format=audio_format) + ): + logging.info( + f"Processing audio chunk {chunk_idx} with size {len(audio_chunk)} bytes" + ) + if audio_format.lower() == "wav": + # Convert bytes back to numpy float32 array for playback + # Assuming audio_chunk is raw PCM data (LINEAR16) + samples = ( + np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) + / 32767.0 + ) + elif audio_format.lower() in ["mp3", "flac"]: + # For formats like MP3 or FLAC, you need to decode them back to PCM + # This requires additional processing which is not implemented here + # For simplicity, we'll skip playback for non-WAV formats + samples = None + logging.warning( + f"Playback for format '{audio_format}' is not implemented."
+ ) + else: + raise UnsupportedFileFormat(f"Unsupported format: {audio_format}") + + if samples is not None: + # Add audio samples to the buffer for streaming playback + self.audio_buffer.put(samples) + logging.info(f"Audio chunk {chunk_idx} added to buffer") + + # Write the chunk to the file if saving + if save_to_file_path: + output_file.write( + audio_chunk + ) # Corrected from f.write to output_file.write + + if not self.audio_started and samples is not None: + logging.info("Starting audio playback...") + self.audio_started = True + + # Signal that audio generation is complete + self.audio_stopped = True + + # Wait for playback to finish + playback_thread.join() + logging.info("Playback finished.") + + except Exception as e: + logging.error(f"Error during speak_streamed: {e}") + self.audio_killed = True + + finally: + if output_file: + output_file.close() + logging.info( + f"Audio successfully saved to {save_to_file_path} in {audio_format} format." + ) + + def synth_to_bytestream( + self, text: Any, format: Optional[str] = "wav" + ) -> Generator[bytes, None, None]: + """ + Synthesizes text to an in-memory bytestream in the specified audio format. + Yields audio data chunks as they are generated. + + :param text: The text to synthesize. + :param format: The desired audio format (e.g., 'wav', 'mp3', 'flac'). Defaults to 'wav'. + :return: A generator yielding bytes objects containing audio data. + """ + try: + logging.info(f"[GoogleTransTTS.synth_to_bytestream] Synthesizing text: {text}") + + # Split the text into smaller segments (e.g., sentences) for incremental synthesis + text_segments = self._split_text(text) + + for segment_idx, segment in enumerate(text_segments): + logging.info(f"Synthesizing segment {segment_idx}: {segment}") + #result = self._client.synth( + # str(segment), self._voice, self._lang, include_timepoints=True + #) + #audio_bytes = result["audio_content"] + audio_bytes = self.synth_to_bytes(str(segment)) + + if format.lower() == "wav": + # Yield raw PCM data (skip WAV header if necessary) + # Google TTS returns LINEAR16 PCM in WAV format + audio_stream = BytesIO(audio_bytes) + audio_stream.seek(44) # Skip the 44-byte WAV header + chunk_size = 1024 # Number of bytes per chunk + + while True: + chunk = audio_stream.read(chunk_size) + if not chunk: + break + yield chunk + + elif format.lower() in ["mp3", "flac"]: + # Convert PCM to the desired format using _convert_audio + pcm_data = np.frombuffer(audio_bytes, dtype=np.int16) + converted_audio = self._convert_audio( + pcm_data, format, self.audio_rate + ) + chunk_size = 4096 # Number of bytes per chunk + audio_io = BytesIO(converted_audio) + + while True: + chunk = audio_io.read(chunk_size) + if not chunk: + break + yield chunk + + else: + raise UnsupportedFileFormat(f"Unsupported format: {format}") + + except Exception as e: + logging.error(f"Error in synth_to_bytestream: {e}") + raise + + def play_audio(self): + """ + Plays audio from the audio_buffer using sounddevice. + """ + try: + logging.info("Starting audio playback thread...") + with sd.OutputStream( + samplerate=self.audio_rate, + channels=1, + callback=self.play_audio_callback, + blocksize=4096, + dtype="float32", + ): + self.playback_finished.wait() + except Exception as e: + logging.error(f"Error during audio playback: {e}") + self.audio_killed = True + + def _split_text(self, text: str) -> List[str]: + # Simple sentence splitter based on punctuation.
+ sentences = re.split(r"(?<=[.!?]) +", text) + return sentences + + # Audio playback callback, called continuously to stream audio from the buffer + def play_audio_callback( + self, outdata: np.ndarray, frames: int, time_info, status: sd.CallbackFlags + ): + if self.audio_killed or ( + self.audio_started and self.audio_buffer.empty() and self.audio_stopped + ): + logging.error("AUDIO KILLED OR STOPPED OR BUFFER EMPTY") + self.playback_finished.set() + return + + if self.audio_buffer.empty(): + outdata.fill(0) + return + + n = 0 + while n < frames and not self.audio_buffer.empty(): + remaining = frames - n + current_chunk = self.audio_buffer.queue[0] + k = current_chunk.shape[0] + + if remaining <= k: + outdata[n:, 0] = current_chunk[:remaining] + self.audio_buffer.queue[0] = current_chunk[remaining:] + n = frames + if self.audio_buffer.queue[0].shape[0] == 0: + self.audio_buffer.get() + break + + outdata[n : n + k, 0] = self.audio_buffer.get() + n += k + + if n < frames: + outdata[n:, 0] = 0 \ No newline at end of file diff --git a/tts_wrapper/engines/microsoft/microsoft.py b/tts_wrapper/engines/microsoft/microsoft.py index 6b9c5056..988bd028 100644 --- a/tts_wrapper/engines/microsoft/microsoft.py +++ b/tts_wrapper/engines/microsoft/microsoft.py @@ -1,5 +1,5 @@ # The MS SpeechSDK can do a lot of our base class - and better. So lets overrride that -from typing import Any, List, Dict, Optional +from typing import Any, List, Dict, Optional, Tuple, Generator from ...exceptions import UnsupportedFileFormat from ...tts import AbstractTTS, FileFormat @@ -17,8 +17,14 @@ except ImportError: speechsdk = None # type: ignore +import numpy as np import logging import threading +import queue +import sounddevice as sd +import time +from io import BytesIO +import re class MicrosoftTTS(AbstractTTS): @@ -32,6 +38,11 @@ def __init__( self._client = client self.set_voice(voice or "en-US-JennyNeural", lang or "en-US") self._ssml = MicrosoftSSML(self._lang, self._voice) + self.audio_buffer = queue.Queue() + self.playback_finished = threading.Event() + self.audio_started = False + self.audio_stopped = False + self.audio_killed = False # Ensure we're requesting word boundary information self._client.speech_config.set_property( @@ -150,3 +161,216 @@ def _is_ssml(self, ssml): @property def ssml(self) -> MicrosoftSSML: return self._ssml + + + def speak_streamed( + self, + text: str, + save_to_file_path: Optional[str] = None, + audio_format: Optional[str] = "wav", + ) -> None: + """ + Synthesizes text and plays it back using sounddevice in a streaming fashion. + Optionally saves the audio to a file after playback completes. + + :param text: The text to synthesize and play. + :param save_to_file_path: Path to save the audio file (optional). + :param audio_format: Audio format to save (e.g., 'wav', 'mp3', 'flac'). + """ + logging.info( + "[MicrosoftTTS.speak_streamed] Starting speech synthesis and playback..." + ) + + # Reset flags + self.audio_started = False + self.audio_stopped = False + self.playback_finished.clear() + + # Open the output file if saving is required + output_file = None + if save_to_file_path: + output_file = open(save_to_file_path, "wb") + logging.info( + f"Saving audio to {save_to_file_path} in {audio_format} format." 
+ ) + + try: + # Start audio playback in a separate thread + playback_thread = threading.Thread(target=self.play_audio) + playback_thread.start() + + # Iterate over the generator returned by synth_to_bytestream + for chunk_idx, audio_chunk in enumerate( + self.synth_to_bytestream(text, format=audio_format) + ): + logging.info( + f"Processing audio chunk {chunk_idx} with size {len(audio_chunk)} bytes" + ) + if audio_format.lower() == "wav": + # Convert bytes back to numpy float32 array for playback + # Assuming audio_chunk is raw PCM data (LINEAR16) + samples = ( + np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) + / 32767.0 + ) + elif audio_format.lower() in ["mp3", "flac"]: + # For formats like MP3 or FLAC, you need to decode them back to PCM + # This requires additional processing which is not implemented here + # For simplicity, we'll skip playback for non-WAV formats + samples = None + logging.warning( + f"Playback for format '{audio_format}' is not implemented." + ) + else: + raise UnsupportedFileFormat(f"Unsupported format: {audio_format}") + + if samples is not None: + # Add audio samples to the buffer for streaming playback + self.audio_buffer.put(samples) + logging.info(f"Audio chunk {chunk_idx} added to buffer") + + # Write the chunk to the file if saving + if save_to_file_path: + output_file.write( + audio_chunk + ) # Corrected from f.write to output_file.write + + if not self.audio_started and samples is not None: + logging.info("Starting audio playback...") + self.audio_started = True + + # Signal that audio generation is complete + self.audio_stopped = True + + # Wait for playback to finish + playback_thread.join() + logging.info("Playback finished.") + + except Exception as e: + logging.error(f"Error during speak_streamed: {e}") + self.audio_killed = True + + finally: + if output_file: + output_file.close() + logging.info( + f"Audio successfully saved to {save_to_file_path} in {audio_format} format." + ) + + def synth_to_bytestream( + self, text: Any, format: Optional[str] = "wav" + ) -> Generator[bytes, None, None]: + """ + Synthesizes text to an in-memory bytestream in the specified audio format. + Yields audio data chunks as they are generated. + + :param text: The text to synthesize. + :param format: The desired audio format (e.g., 'wav', 'mp3', 'flac'). Defaults to 'wav'. + :return: A generator yielding bytes objects containing audio data. 
+ """ + try: + logging.info(f"[MicrosoftTTS.synth_to_bytestream] Synthesizing text: {text}") + + # Split the text into smaller segments (e.g., sentences) for incremental synthesis + text_segments = self._split_text(text) + + for segment_idx, segment in enumerate(text_segments): + logging.info(f"Synthesizing segment {segment_idx}: {segment}") + #result = self._client.synth( + # str(segment), self._voice, self._lang, include_timepoints=True + #) + #audio_bytes = result["audio_content"] + audio_bytes = self.synth_to_bytes(str(segment)) + + if format.lower() == "wav": + # Yield raw PCM data (skip WAV header if necessary) + # Google TTS returns LINEAR16 PCM in WAV format + audio_stream = BytesIO(audio_bytes) + audio_stream.seek(44) # Skip the 44-byte WAV header + chunk_size = 1024 # Number of bytes per chunk + + while True: + chunk = audio_stream.read(chunk_size) + if not chunk: + break + yield chunk + + elif format.lower() in ["mp3", "flac"]: + # Convert PCM to the desired format using _convert_audio + pcm_data = np.frombuffer(audio_bytes, dtype=np.int16) + converted_audio = self._convert_audio( + pcm_data, format, self.audio_rate + ) + chunk_size = 4096 # Number of bytes per chunk + audio_io = BytesIO(converted_audio) + + while True: + chunk = audio_io.read(chunk_size) + if not chunk: + break + yield chunk + + else: + raise UnsupportedFileFormat(f"Unsupported format: {format}") + + except Exception as e: + logging.error(f"Error in synth_to_bytestream: {e}") + raise + + def play_audio(self): + """ + Plays audio from the audio_buffer using sounddevice. + """ + try: + logging.info("Starting audio playback thread...") + with sd.OutputStream( + samplerate=self.audio_rate, + channels=1, + callback=self.play_audio_callback, + blocksize=4096, + dtype="float32", + ): + self.playback_finished.wait() + except Exception as e: + logging.error(f"Error during audio playback: {e}") + self.audio_killed = True + + def _split_text(self, text: str) -> List[str]: + # Simple sentence splitter based on punctuation. + sentences = re.split(r"(?<=[.!?]) +", text) + return sentences + + # Audio playback callback, called continuously to stream audio from the buffer + def play_audio_callback( + self, outdata: np.ndarray, frames: int, time_info, status: sd.CallbackFlags + ): + if self.audio_killed or ( + self.audio_started and self.audio_buffer.empty() and self.audio_stopped + ): + logging.error("AUDIO KILLED OR STOPPED OR BUFFER EMPTY") + self.playback_finished.set() + return + + if self.audio_buffer.empty(): + outdata.fill(0) + return + + n = 0 + while n < frames and not self.audio_buffer.empty(): + remaining = frames - n + current_chunk = self.audio_buffer.queue[0] + k = current_chunk.shape[0] + + if remaining <= k: + outdata[n:, 0] = current_chunk[:remaining] + self.audio_buffer.queue[0] = current_chunk[remaining:] + n = frames + if self.audio_buffer.queue[0].shape[0] == 0: + self.audio_buffer.get() + break + + outdata[n : n + k, 0] = self.audio_buffer.get() + n += k + + if n < frames: + outdata[n:, 0] = 0 \ No newline at end of file diff --git a/tts_wrapper/engines/mms/mms.py b/tts_wrapper/engines/mms/mms.py index 4345d351..140d8f86 100644 --- a/tts_wrapper/engines/mms/mms.py +++ b/tts_wrapper/engines/mms/mms.py @@ -1,9 +1,16 @@ -from typing import Any, List, Dict, Optional +from typing import Any, List, Dict, Optional, Tuple, Generator from ...exceptions import UnsupportedFileFormat from ...tts import AbstractTTS, FileFormat from . 
import MMSClient, MMSSSML + +import numpy as np +import logging +import threading +import queue +import sounddevice as sd +import time +from io import BytesIO import re -import io try: import numpy as np @@ -116,3 +123,216 @@ def ssml(self) -> MMSSSML: def get_voices(self) -> List[Dict[str, Any]]: return self._client.get_voices() + + + def speak_streamed( + self, + text: str, + save_to_file_path: Optional[str] = None, + audio_format: Optional[str] = "wav", + ) -> None: + """ + Synthesizes text and plays it back using sounddevice in a streaming fashion. + Optionally saves the audio to a file after playback completes. + + :param text: The text to synthesize and play. + :param save_to_file_path: Path to save the audio file (optional). + :param audio_format: Audio format to save (e.g., 'wav', 'mp3', 'flac'). + """ + logging.info( + "[MMSTTS.speak_streamed] Starting speech synthesis and playback..." + ) + + # Reset flags + self.audio_started = False + self.audio_stopped = False + self.playback_finished.clear() + + # Open the output file if saving is required + output_file = None + if save_to_file_path: + output_file = open(save_to_file_path, "wb") + logging.info( + f"Saving audio to {save_to_file_path} in {audio_format} format." + ) + + try: + # Start audio playback in a separate thread + playback_thread = threading.Thread(target=self.play_audio) + playback_thread.start() + + # Iterate over the generator returned by synth_to_bytestream + for chunk_idx, audio_chunk in enumerate( + self.synth_to_bytestream(text, format=audio_format) + ): + logging.info( + f"Processing audio chunk {chunk_idx} with size {len(audio_chunk)} bytes" + ) + if audio_format.lower() == "wav": + # Convert bytes back to numpy float32 array for playback + # Assuming audio_chunk is raw PCM data (LINEAR16) + samples = ( + np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) + / 32767.0 + ) + elif audio_format.lower() in ["mp3", "flac"]: + # For formats like MP3 or FLAC, you need to decode them back to PCM + # This requires additional processing which is not implemented here + # For simplicity, we'll skip playback for non-WAV formats + samples = None + logging.warning( + f"Playback for format '{audio_format}' is not implemented." + ) + else: + raise UnsupportedFileFormat(f"Unsupported format: {audio_format}") + + if samples is not None: + # Add audio samples to the buffer for streaming playback + self.audio_buffer.put(samples) + logging.info(f"Audio chunk {chunk_idx} added to buffer") + + # Write the chunk to the file if saving + if save_to_file_path: + output_file.write( + audio_chunk + ) # Corrected from f.write to output_file.write + + if not self.audio_started and samples is not None: + logging.info("Starting audio playback...") + self.audio_started = True + + # Signal that audio generation is complete + self.audio_stopped = True + + # Wait for playback to finish + playback_thread.join() + logging.info("Playback finished.") + + except Exception as e: + logging.error(f"Error during speak_streamed: {e}") + self.audio_killed = True + + finally: + if output_file: + output_file.close() + logging.info( + f"Audio successfully saved to {save_to_file_path} in {audio_format} format." + ) + + def synth_to_bytestream( + self, text: Any, format: Optional[str] = "wav" + ) -> Generator[bytes, None, None]: + """ + Synthesizes text to an in-memory bytestream in the specified audio format. + Yields audio data chunks as they are generated. + + :param text: The text to synthesize. 
+ :param format: The desired audio format (e.g., 'wav', 'mp3', 'flac'). Defaults to 'wav'. + :return: A generator yielding bytes objects containing audio data. + """ + try: + logging.info(f"[MMSTTS.synth_to_bytestream] Synthesizing text: {text}") + + # Split the text into smaller segments (e.g., sentences) for incremental synthesis + text_segments = self._split_text(text) + + for segment_idx, segment in enumerate(text_segments): + logging.info(f"Synthesizing segment {segment_idx}: {segment}") + #result = self._client.synth( + # str(segment), self._voice, self._lang, include_timepoints=True + #) + #audio_bytes = result["audio_content"] + audio_bytes = self.synth(text=str(segment)) + + if format.lower() == "wav": + # Yield raw PCM data (skip WAV header if necessary) + # Google TTS returns LINEAR16 PCM in WAV format + audio_stream = BytesIO(audio_bytes) + audio_stream.seek(44) # Skip the 44-byte WAV header + chunk_size = 1024 # Number of bytes per chunk + + while True: + chunk = audio_stream.read(chunk_size) + if not chunk: + break + yield chunk + + elif format.lower() in ["mp3", "flac"]: + # Convert PCM to the desired format using _convert_audio + pcm_data = np.frombuffer(audio_bytes, dtype=np.int16) + converted_audio = self._convert_audio( + pcm_data, format, self.audio_rate + ) + chunk_size = 4096 # Number of bytes per chunk + audio_io = BytesIO(converted_audio) + + while True: + chunk = audio_io.read(chunk_size) + if not chunk: + break + yield chunk + + else: + raise UnsupportedFileFormat(f"Unsupported format: {format}") + + except Exception as e: + logging.error(f"Error in synth_to_bytestream: {e}") + raise + + def play_audio(self): + """ + Plays audio from the audio_buffer using sounddevice. + """ + try: + logging.info("Starting audio playback thread...") + with sd.OutputStream( + samplerate=self.audio_rate, + channels=1, + callback=self.play_audio_callback, + blocksize=4096, + dtype="float32", + ): + self.playback_finished.wait() + except Exception as e: + logging.error(f"Error during audio playback: {e}") + self.audio_killed = True + + def _split_text(self, text: str) -> List[str]: + # Simple sentence splitter based on punctuation. 
+ sentences = re.split(r"(?<=[.!?]) +", text) + return sentences + + # Audio playback callback, called continuously to stream audio from the buffer + def play_audio_callback( + self, outdata: np.ndarray, frames: int, time_info, status: sd.CallbackFlags + ): + if self.audio_killed or ( + self.audio_started and self.audio_buffer.empty() and self.audio_stopped + ): + logging.error("AUDIO KILLED OR STOPPED OR BUFFER EMPTY") + self.playback_finished.set() + return + + if self.audio_buffer.empty(): + outdata.fill(0) + return + + n = 0 + while n < frames and not self.audio_buffer.empty(): + remaining = frames - n + current_chunk = self.audio_buffer.queue[0] + k = current_chunk.shape[0] + + if remaining <= k: + outdata[n:, 0] = current_chunk[:remaining] + self.audio_buffer.queue[0] = current_chunk[remaining:] + n = frames + if self.audio_buffer.queue[0].shape[0] == 0: + self.audio_buffer.get() + break + + outdata[n : n + k, 0] = self.audio_buffer.get() + n += k + + if n < frames: + outdata[n:, 0] = 0 \ No newline at end of file diff --git a/tts_wrapper/engines/pico/pico.py b/tts_wrapper/engines/pico/pico.py index a2bb348c..58bfc61d 100644 --- a/tts_wrapper/engines/pico/pico.py +++ b/tts_wrapper/engines/pico/pico.py @@ -1,9 +1,17 @@ -from typing import Any, List, Optional +from typing import Any, List, Dict, Optional, Tuple, Generator from ...exceptions import UnsupportedFileFormat from ...tts import AbstractTTS, FileFormat from . import PicoClient +import numpy as np +import logging +import threading +import queue +import sounddevice as sd +import time +from io import BytesIO +import re class PicoTTS(AbstractTTS): def __init__(self, client: PicoClient, voice: Optional[str] = None) -> None: @@ -15,3 +23,217 @@ def synth_to_bytes(self, text: Any) -> bytes: def construct_prosody_tag(self, text: str) -> str: pass + + + def speak_streamed( + self, + text: str, + save_to_file_path: Optional[str] = None, + audio_format: Optional[str] = "wav", + ) -> None: + """ + Synthesizes text and plays it back using sounddevice in a streaming fashion. + Optionally saves the audio to a file after playback completes. + + :param text: The text to synthesize and play. + :param save_to_file_path: Path to save the audio file (optional). + :param audio_format: Audio format to save (e.g., 'wav', 'mp3', 'flac'). + """ + logging.info( + "[PicoTTS.speak_streamed] Starting speech synthesis and playback..." + ) + + # Reset flags + self.audio_started = False + self.audio_stopped = False + self.playback_finished.clear() + + # Open the output file if saving is required + output_file = None + if save_to_file_path: + output_file = open(save_to_file_path, "wb") + logging.info( + f"Saving audio to {save_to_file_path} in {audio_format} format." 
+ ) + + try: + # Start audio playback in a separate thread + playback_thread = threading.Thread(target=self.play_audio) + playback_thread.start() + + # Iterate over the generator returned by synth_to_bytestream + for chunk_idx, audio_chunk in enumerate( + self.synth_to_bytestream(text, format=audio_format) + ): + logging.info( + f"Processing audio chunk {chunk_idx} with size {len(audio_chunk)} bytes" + ) + if audio_format.lower() == "wav": + # Convert bytes back to numpy float32 array for playback + # Assuming audio_chunk is raw PCM data (LINEAR16) + samples = ( + np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) + / 32767.0 + ) + elif audio_format.lower() in ["mp3", "flac"]: + # For formats like MP3 or FLAC, you need to decode them back to PCM + # This requires additional processing which is not implemented here + # For simplicity, we'll skip playback for non-WAV formats + samples = None + logging.warning( + f"Playback for format '{audio_format}' is not implemented." + ) + else: + raise UnsupportedFileFormat(f"Unsupported format: {audio_format}") + + if samples is not None: + # Add audio samples to the buffer for streaming playback + self.audio_buffer.put(samples) + logging.info(f"Audio chunk {chunk_idx} added to buffer") + + # Write the chunk to the file if saving + if save_to_file_path: + output_file.write( + audio_chunk + ) # Corrected from f.write to output_file.write + + if not self.audio_started and samples is not None: + logging.info("Starting audio playback...") + self.audio_started = True + + # Signal that audio generation is complete + self.audio_stopped = True + + # Wait for playback to finish + playback_thread.join() + logging.info("Playback finished.") + + except Exception as e: + logging.error(f"Error during speak_streamed: {e}") + self.audio_killed = True + + finally: + if output_file: + output_file.close() + logging.info( + f"Audio successfully saved to {save_to_file_path} in {audio_format} format." + ) + + + def synth_to_bytestream( + self, text: Any, format: Optional[str] = "wav" + ) -> Generator[bytes, None, None]: + """ + Synthesizes text to an in-memory bytestream in the specified audio format. + Yields audio data chunks as they are generated. + + :param text: The text to synthesize. + :param format: The desired audio format (e.g., 'wav', 'mp3', 'flac'). Defaults to 'wav'. + :return: A generator yielding bytes objects containing audio data. 
+ """ + try: + logging.info(f"[GoogleTTS.synth_to_bytestream] Synthesizing text: {text}") + + # Split the text into smaller segments (e.g., sentences) for incremental synthesis + text_segments = self._split_text(text) + + for segment_idx, segment in enumerate(text_segments): + logging.info(f"Synthesizing segment {segment_idx}: {segment}") + #result = self._client.synth( + # str(segment), self._voice, self._lang, include_timepoints=True + #) + #audio_bytes = result["audio_content"] + audio_bytes = self.synth(str(segment), self._voice) + + if format.lower() == "wav": + # Yield raw PCM data (skip WAV header if necessary) + # Google TTS returns LINEAR16 PCM in WAV format + audio_stream = BytesIO(audio_bytes) + audio_stream.seek(44) # Skip the 44-byte WAV header + chunk_size = 1024 # Number of bytes per chunk + + while True: + chunk = audio_stream.read(chunk_size) + if not chunk: + break + yield chunk + + elif format.lower() in ["mp3", "flac"]: + # Convert PCM to the desired format using _convert_audio + pcm_data = np.frombuffer(audio_bytes, dtype=np.int16) + converted_audio = self._convert_audio( + pcm_data, format, self.audio_rate + ) + chunk_size = 4096 # Number of bytes per chunk + audio_io = BytesIO(converted_audio) + + while True: + chunk = audio_io.read(chunk_size) + if not chunk: + break + yield chunk + + else: + raise UnsupportedFileFormat(f"Unsupported format: {format}") + + except Exception as e: + logging.error(f"Error in synth_to_bytestream: {e}") + raise + + def play_audio(self): + """ + Plays audio from the audio_buffer using sounddevice. + """ + try: + logging.info("Starting audio playback thread...") + with sd.OutputStream( + samplerate=self.audio_rate, + channels=1, + callback=self.play_audio_callback, + blocksize=4096, + dtype="float32", + ): + self.playback_finished.wait() + except Exception as e: + logging.error(f"Error during audio playback: {e}") + self.audio_killed = True + + def _split_text(self, text: str) -> List[str]: + # Simple sentence splitter based on punctuation. + sentences = re.split(r"(?<=[.!?]) +", text) + return sentences + + # Audio playback callback, called continuously to stream audio from the buffer + def play_audio_callback( + self, outdata: np.ndarray, frames: int, time_info, status: sd.CallbackFlags + ): + if self.audio_killed or ( + self.audio_started and self.audio_buffer.empty() and self.audio_stopped + ): + logging.error("AUDIO KILLED OR STOPPED OR BUFFER EMPTY") + self.playback_finished.set() + return + + if self.audio_buffer.empty(): + outdata.fill(0) + return + + n = 0 + while n < frames and not self.audio_buffer.empty(): + remaining = frames - n + current_chunk = self.audio_buffer.queue[0] + k = current_chunk.shape[0] + + if remaining <= k: + outdata[n:, 0] = current_chunk[:remaining] + self.audio_buffer.queue[0] = current_chunk[remaining:] + n = frames + if self.audio_buffer.queue[0].shape[0] == 0: + self.audio_buffer.get() + break + + outdata[n : n + k, 0] = self.audio_buffer.get() + n += k + + if n < frames: + outdata[n:, 0] = 0 \ No newline at end of file diff --git a/tts_wrapper/engines/polly/polly.py b/tts_wrapper/engines/polly/polly.py index 1ef109f2..58794206 100644 --- a/tts_wrapper/engines/polly/polly.py +++ b/tts_wrapper/engines/polly/polly.py @@ -1,7 +1,16 @@ -from typing import Any, List, Optional, Dict, Tuple +from typing import Any, List, Dict, Optional, Tuple, Generator from ...exceptions import UnsupportedFileFormat from ...tts import AbstractTTS, FileFormat from . 
import PollyClient, PollySSML +import re +import numpy as np +import pathlib +import logging +import threading +import queue +import sounddevice as sd +import time +from io import BytesIO class PollyTTS(AbstractTTS): @@ -15,6 +24,12 @@ def __init__( self._client = client self.set_voice(voice or "Joanna", lang or "en-US") self.audio_rate = 16000 + self.audio_buffer = queue.Queue() + self.playback_finished = threading.Event() + self.audio_started = False + self.audio_stopped = False + self.audio_killed = False + self.audio_format = "wav" # Default format def synth_to_bytes(self, text: Any) -> bytes: if not self._is_ssml(str(text)): @@ -97,3 +112,216 @@ def mapped_to_predefined_word(self, volume: str) -> str: return "loud" if 81 <= volume_in_float <= 100: return "x-loud" + + + def speak_streamed( + self, + text: str, + save_to_file_path: Optional[str] = None, + audio_format: Optional[str] = "wav", + ) -> None: + """ + Synthesizes text and plays it back using sounddevice in a streaming fashion. + Optionally saves the audio to a file after playback completes. + + :param text: The text to synthesize and play. + :param save_to_file_path: Path to save the audio file (optional). + :param audio_format: Audio format to save (e.g., 'wav', 'mp3', 'flac'). + """ + logging.info( + "[PollyTTS.speak_streamed] Starting speech synthesis and playback..." + ) + + # Reset flags + self.audio_started = False + self.audio_stopped = False + self.playback_finished.clear() + + # Open the output file if saving is required + output_file = None + if save_to_file_path: + output_file = open(save_to_file_path, "wb") + logging.info( + f"Saving audio to {save_to_file_path} in {audio_format} format." + ) + + try: + # Start audio playback in a separate thread + playback_thread = threading.Thread(target=self.play_audio) + playback_thread.start() + + # Iterate over the generator returned by synth_to_bytestream + for chunk_idx, audio_chunk in enumerate( + self.synth_to_bytestream(str(text), format=audio_format) + ): + logging.info( + f"Processing audio chunk {chunk_idx} with size {len(audio_chunk)} bytes" + ) + if audio_format.lower() == "wav": + # Convert bytes back to numpy float32 array for playback + # Assuming audio_chunk is raw PCM data (LINEAR16) + samples = ( + np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) + / 32767.0 + ) + elif audio_format.lower() in ["mp3", "flac"]: + # For formats like MP3 or FLAC, you need to decode them back to PCM + # This requires additional processing which is not implemented here + # For simplicity, we'll skip playback for non-WAV formats + samples = None + logging.warning( + f"Playback for format '{audio_format}' is not implemented." 
+                    )
+                else:
+                    raise UnsupportedFileFormat(f"Unsupported format: {audio_format}")
+
+                if samples is not None:
+                    # Add audio samples to the buffer for streaming playback
+                    self.audio_buffer.put(samples)
+                    logging.info(f"Audio chunk {chunk_idx} added to buffer")
+
+                    # Write the chunk to the file if saving
+                    if save_to_file_path:
+                        output_file.write(audio_chunk)
+
+                if not self.audio_started and samples is not None:
+                    logging.info("Starting audio playback...")
+                    self.audio_started = True
+
+            # Signal that audio generation is complete
+            self.audio_stopped = True
+
+            # Wait for playback to finish
+            playback_thread.join()
+            logging.info("Playback finished.")
+
+        except Exception as e:
+            logging.error(f"Error during speak_streamed: {e}")
+            self.audio_killed = True
+
+        finally:
+            if output_file:
+                output_file.close()
+                logging.info(
+                    f"Audio successfully saved to {save_to_file_path} in {audio_format} format."
+                )
+
+    def synth_to_bytestream(
+        self, text: Any, format: Optional[str] = "wav"
+    ) -> Generator[bytes, None, None]:
+        """
+        Synthesizes text to an in-memory bytestream in the specified audio format.
+        Yields audio data chunks as they are generated.
+
+        :param text: The text to synthesize.
+        :param format: The desired audio format (e.g., 'wav', 'mp3', 'flac'). Defaults to 'wav'.
+        :return: A generator yielding bytes objects containing audio data.
+        """
+        try:
+            logging.info(f"[PollyTTS.synth_to_bytestream] Synthesizing text: {text}")
+
+            # Split the text into smaller segments (e.g., sentences) for incremental synthesis
+            text_segments = self._split_text(text)
+
+            for segment_idx, segment in enumerate(text_segments):
+                logging.info(f"Synthesizing segment {segment_idx}: {segment}")
+                audio_bytes = self.synth_to_bytes(str(segment))
+
+                if format.lower() == "wav":
+                    # Yield raw PCM data, skipping the WAV header.
+                    # Polly audio from synth_to_bytes is assumed to be 16-bit
+                    # PCM in a canonical 44-byte-header WAV container.
+                    audio_stream = BytesIO(audio_bytes)
+                    audio_stream.seek(44)  # Skip the 44-byte WAV header
+                    chunk_size = 1024  # Number of bytes per chunk
+
+                    while True:
+                        chunk = audio_stream.read(chunk_size)
+                        if not chunk:
+                            break
+                        yield chunk
+
+                elif format.lower() in ["mp3", "flac"]:
+                    # Convert PCM to the desired format using _convert_audio
+                    pcm_data = np.frombuffer(audio_bytes, dtype=np.int16)
+                    converted_audio = self._convert_audio(
+                        pcm_data, format, self.audio_rate
+                    )
+                    chunk_size = 4096  # Number of bytes per chunk
+                    audio_io = BytesIO(converted_audio)
+
+                    while True:
+                        chunk = audio_io.read(chunk_size)
+                        if not chunk:
+                            break
+                        yield chunk
+
+                else:
+                    raise UnsupportedFileFormat(f"Unsupported format: {format}")
+
+        except Exception as e:
+            logging.error(f"Error in synth_to_bytestream: {e}")
+            raise
+
+    def play_audio(self):
+        """
+        Plays audio from the audio_buffer using sounddevice.
+        """
+        try:
+            logging.info("Starting audio playback thread...")
+            with sd.OutputStream(
+                samplerate=self.audio_rate,
+                channels=1,
+                callback=self.play_audio_callback,
+                blocksize=4096,
+                dtype="float32",
+            ):
+                self.playback_finished.wait()
+        except Exception as e:
+            logging.error(f"Error during audio playback: {e}")
+            self.audio_killed = True
+
+    def _split_text(self, text: str) -> List[str]:
+        # Simple sentence splitter based on punctuation.
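+        # For example, "Hi there. All good?" yields ["Hi there.", "All good?"].
+        # Known limitation: abbreviations such as "Dr." or "e.g." cause false
+        # splits; a real tokenizer (e.g. NLTK punkt, an external dependency
+        # not used here) would handle those cases.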
+ sentences = re.split(r"(?<=[.!?]) +", text) + return sentences + + # Audio playback callback, called continuously to stream audio from the buffer + def play_audio_callback( + self, outdata: np.ndarray, frames: int, time_info, status: sd.CallbackFlags + ): + if self.audio_killed or ( + self.audio_started and self.audio_buffer.empty() and self.audio_stopped + ): + logging.error("AUDIO KILLED OR STOPPED OR BUFFER EMPTY") + self.playback_finished.set() + return + + if self.audio_buffer.empty(): + outdata.fill(0) + return + + n = 0 + while n < frames and not self.audio_buffer.empty(): + remaining = frames - n + current_chunk = self.audio_buffer.queue[0] + k = current_chunk.shape[0] + + if remaining <= k: + outdata[n:, 0] = current_chunk[:remaining] + self.audio_buffer.queue[0] = current_chunk[remaining:] + n = frames + if self.audio_buffer.queue[0].shape[0] == 0: + self.audio_buffer.get() + break + + outdata[n : n + k, 0] = self.audio_buffer.get() + n += k + + if n < frames: + outdata[n:, 0] = 0 diff --git a/tts_wrapper/engines/sapi/sapi.py b/tts_wrapper/engines/sapi/sapi.py index 0edb0ca2..2a71f8db 100644 --- a/tts_wrapper/engines/sapi/sapi.py +++ b/tts_wrapper/engines/sapi/sapi.py @@ -1,8 +1,17 @@ -from typing import Any, List, Optional, Literal +from typing import Any, List, Optional, Literal, Tuple, Generator from ...exceptions import UnsupportedFileFormat from ...tts import AbstractTTS, FileFormat from . import SAPIClient from .ssml import SAPISSML +import re +import numpy as np +import pathlib +import logging +import threading +import queue +import sounddevice as sd +import time +from io import BytesIO class SAPITTS(AbstractTTS): @@ -13,6 +22,12 @@ def supported_formats(cls) -> List[FileFormat]: def __init__(self, client: SAPIClient) -> None: super().__init__() self._client = client + self.audio_buffer = queue.Queue() + self.playback_finished = threading.Event() + self.audio_started = False + self.audio_stopped = False + self.audio_killed = False + self.audio_format = "wav" # Default format def set_voice(self, voice_id: str, lang_id: Optional[str] = None): """Set the TTS voice by ID and optionally set the language ID.""" @@ -76,3 +91,215 @@ def _map_volume_to_predefined_word(self, volume: str) -> str: return "x-loud" else: return "medium" + + def speak_streamed( + self, + text: str, + save_to_file_path: Optional[str] = None, + audio_format: Optional[str] = "wav", + ) -> None: + """ + Synthesizes text and plays it back using sounddevice in a streaming fashion. + Optionally saves the audio to a file after playback completes. + + :param text: The text to synthesize and play. + :param save_to_file_path: Path to save the audio file (optional). + :param audio_format: Audio format to save (e.g., 'wav', 'mp3', 'flac'). + """ + logging.info( + "[SAPITTS.speak_streamed] Starting speech synthesis and playback..." + ) + + # Reset flags + self.audio_started = False + self.audio_stopped = False + self.playback_finished.clear() + + # Open the output file if saving is required + output_file = None + if save_to_file_path: + output_file = open(save_to_file_path, "wb") + logging.info( + f"Saving audio to {save_to_file_path} in {audio_format} format." 
+ ) + + try: + # Start audio playback in a separate thread + playback_thread = threading.Thread(target=self.play_audio) + playback_thread.start() + + # Iterate over the generator returned by synth_to_bytestream + for chunk_idx, audio_chunk in enumerate( + self.synth_to_bytestream(str(text), format=audio_format) + ): + logging.info( + f"Processing audio chunk {chunk_idx} with size {len(audio_chunk)} bytes" + ) + if audio_format.lower() == "wav": + # Convert bytes back to numpy float32 array for playback + # Assuming audio_chunk is raw PCM data (LINEAR16) + samples = ( + np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) + / 32767.0 + ) + elif audio_format.lower() in ["mp3", "flac"]: + # For formats like MP3 or FLAC, you need to decode them back to PCM + # This requires additional processing which is not implemented here + # For simplicity, we'll skip playback for non-WAV formats + samples = None + logging.warning( + f"Playback for format '{audio_format}' is not implemented." + ) + else: + raise UnsupportedFileFormat(f"Unsupported format: {audio_format}") + + if samples is not None: + # Add audio samples to the buffer for streaming playback + self.audio_buffer.put(samples) + logging.info(f"Audio chunk {chunk_idx} added to buffer") + + # Write the chunk to the file if saving + if save_to_file_path: + output_file.write( + audio_chunk + ) # Corrected from f.write to output_file.write + + if not self.audio_started and samples is not None: + logging.info("Starting audio playback...") + self.audio_started = True + + # Signal that audio generation is complete + self.audio_stopped = True + + # Wait for playback to finish + playback_thread.join() + logging.info("Playback finished.") + + except Exception as e: + logging.error(f"Error during speak_streamed: {e}") + self.audio_killed = True + + finally: + if output_file: + output_file.close() + logging.info( + f"Audio successfully saved to {save_to_file_path} in {audio_format} format." + ) + + def synth_to_bytestream( + self, text: Any, format: Optional[str] = "wav" + ) -> Generator[bytes, None, None]: + """ + Synthesizes text to an in-memory bytestream in the specified audio format. + Yields audio data chunks as they are generated. + + :param text: The text to synthesize. + :param format: The desired audio format (e.g., 'wav', 'mp3', 'flac'). Defaults to 'wav'. + :return: A generator yielding bytes objects containing audio data. 
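+
+        Note: for 'wav' the audio is yielded as raw PCM with the first 44
+        bytes (a canonical RIFF header, by assumption) stripped off.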
+        """
+        try:
+            logging.info(f"[SAPITTS.synth_to_bytestream] Synthesizing text: {text}")
+
+            # Split the text into smaller segments (e.g., sentences) for incremental synthesis
+            text_segments = self._split_text(text)
+
+            for segment_idx, segment in enumerate(text_segments):
+                logging.info(f"Synthesizing segment {segment_idx}: {segment}")
+                audio_bytes = self._client.synth(str(segment))
+
+                if format.lower() == "wav":
+                    # Yield raw PCM data, skipping the WAV header.
+                    # The SAPI client is assumed to return 16-bit PCM in a
+                    # canonical 44-byte-header WAV container.
+                    audio_stream = BytesIO(audio_bytes)
+                    audio_stream.seek(44)  # Skip the 44-byte WAV header
+                    chunk_size = 1024  # Number of bytes per chunk
+
+                    while True:
+                        chunk = audio_stream.read(chunk_size)
+                        if not chunk:
+                            break
+                        yield chunk
+
+                elif format.lower() in ["mp3", "flac"]:
+                    # Convert PCM to the desired format using _convert_audio
+                    pcm_data = np.frombuffer(audio_bytes, dtype=np.int16)
+                    converted_audio = self._convert_audio(
+                        pcm_data, format, self.audio_rate
+                    )
+                    chunk_size = 4096  # Number of bytes per chunk
+                    audio_io = BytesIO(converted_audio)
+
+                    while True:
+                        chunk = audio_io.read(chunk_size)
+                        if not chunk:
+                            break
+                        yield chunk
+
+                else:
+                    raise UnsupportedFileFormat(f"Unsupported format: {format}")
+
+        except Exception as e:
+            logging.error(f"Error in synth_to_bytestream: {e}")
+            raise
+
+    def play_audio(self):
+        """
+        Plays audio from the audio_buffer using sounddevice.
+        """
+        try:
+            logging.info("Starting audio playback thread...")
+            with sd.OutputStream(
+                samplerate=self.audio_rate,
+                channels=1,
+                callback=self.play_audio_callback,
+                blocksize=4096,
+                dtype="float32",
+            ):
+                self.playback_finished.wait()
+        except Exception as e:
+            logging.error(f"Error during audio playback: {e}")
+            self.audio_killed = True
+
+    def _split_text(self, text: str) -> List[str]:
+        # Simple sentence splitter based on punctuation.
+        sentences = re.split(r"(?<=[.!?]) +", text)
+        return sentences
+
+    # Audio playback callback, called continuously to stream audio from the buffer
+    def play_audio_callback(
+        self, outdata: np.ndarray, frames: int, time_info, status: sd.CallbackFlags
+    ):
+        if self.audio_killed or (
+            self.audio_started and self.audio_buffer.empty() and self.audio_stopped
+        ):
+            logging.error("AUDIO KILLED OR STOPPED OR BUFFER EMPTY")
+            self.playback_finished.set()
+            return
+
+        if self.audio_buffer.empty():
+            outdata.fill(0)
+            return
+
+        n = 0
+        while n < frames and not self.audio_buffer.empty():
+            remaining = frames - n
+            current_chunk = self.audio_buffer.queue[0]
+            k = current_chunk.shape[0]
+
+            if remaining <= k:
+                outdata[n:, 0] = current_chunk[:remaining]
+                self.audio_buffer.queue[0] = current_chunk[remaining:]
+                n = frames
+                if self.audio_buffer.queue[0].shape[0] == 0:
+                    self.audio_buffer.get()
+                break
+
+            outdata[n : n + k, 0] = self.audio_buffer.get()
+            n += k
+
+        if n < frames:
+            outdata[n:, 0] = 0
diff --git a/tts_wrapper/engines/watson/watson.py b/tts_wrapper/engines/watson/watson.py
index 67a2bba9..0d93c2ac 100644
--- a/tts_wrapper/engines/watson/watson.py
+++ b/tts_wrapper/engines/watson/watson.py
@@ -1,8 +1,17 @@
-from typing import Any, List, Optional, Dict, Tuple
+from typing import Any, List, Optional, Dict, Literal, Tuple, Generator
 from ...exceptions import UnsupportedFileFormat
 from ...tts import AbstractTTS, FileFormat
 from . 
import WatsonClient, WatsonSSML + +import re +import numpy as np +import pathlib import logging +import threading +import queue +import sounddevice as sd +import time +from io import BytesIO class WatsonTTS(AbstractTTS): @@ -17,6 +26,12 @@ def __init__( self._voice = voice or "en-US_LisaV3Voice" self.audio_rate = 22050 self.word_timings = [] + self.audio_buffer = queue.Queue() + self.playback_finished = threading.Event() + self.audio_started = False + self.audio_stopped = False + self.audio_killed = False + self.audio_format = "wav" # Default format def get_audio_duration(self) -> float: if self.generated_audio: @@ -85,3 +100,215 @@ def set_voice(self, voice_id: str, lang_id: str): def construct_prosody_tag(self, text: str) -> str: pass + + def speak_streamed( + self, + text: str, + save_to_file_path: Optional[str] = None, + audio_format: Optional[str] = "wav", + ) -> None: + """ + Synthesizes text and plays it back using sounddevice in a streaming fashion. + Optionally saves the audio to a file after playback completes. + + :param text: The text to synthesize and play. + :param save_to_file_path: Path to save the audio file (optional). + :param audio_format: Audio format to save (e.g., 'wav', 'mp3', 'flac'). + """ + logging.info( + "[WatsonTTS.speak_streamed] Starting speech synthesis and playback..." + ) + + # Reset flags + self.audio_started = False + self.audio_stopped = False + self.playback_finished.clear() + + # Open the output file if saving is required + output_file = None + if save_to_file_path: + output_file = open(save_to_file_path, "wb") + logging.info( + f"Saving audio to {save_to_file_path} in {audio_format} format." + ) + + try: + # Start audio playback in a separate thread + playback_thread = threading.Thread(target=self.play_audio) + playback_thread.start() + + # Iterate over the generator returned by synth_to_bytestream + for chunk_idx, audio_chunk in enumerate( + self.synth_to_bytestream(str(text), format=audio_format) + ): + logging.info( + f"Processing audio chunk {chunk_idx} with size {len(audio_chunk)} bytes" + ) + if audio_format.lower() == "wav": + # Convert bytes back to numpy float32 array for playback + # Assuming audio_chunk is raw PCM data (LINEAR16) + samples = ( + np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) + / 32767.0 + ) + elif audio_format.lower() in ["mp3", "flac"]: + # For formats like MP3 or FLAC, you need to decode them back to PCM + # This requires additional processing which is not implemented here + # For simplicity, we'll skip playback for non-WAV formats + samples = None + logging.warning( + f"Playback for format '{audio_format}' is not implemented." 
+                    )
+                else:
+                    raise UnsupportedFileFormat(f"Unsupported format: {audio_format}")
+
+                if samples is not None:
+                    # Add audio samples to the buffer for streaming playback
+                    self.audio_buffer.put(samples)
+                    logging.info(f"Audio chunk {chunk_idx} added to buffer")
+
+                    # Write the chunk to the file if saving
+                    if save_to_file_path:
+                        output_file.write(audio_chunk)
+
+                if not self.audio_started and samples is not None:
+                    logging.info("Starting audio playback...")
+                    self.audio_started = True
+
+            # Signal that audio generation is complete
+            self.audio_stopped = True
+
+            # Wait for playback to finish
+            playback_thread.join()
+            logging.info("Playback finished.")
+
+        except Exception as e:
+            logging.error(f"Error during speak_streamed: {e}")
+            self.audio_killed = True
+
+        finally:
+            if output_file:
+                output_file.close()
+                logging.info(
+                    f"Audio successfully saved to {save_to_file_path} in {audio_format} format."
+                )
+
+    def synth_to_bytestream(
+        self, text: Any, format: Optional[str] = "wav"
+    ) -> Generator[bytes, None, None]:
+        """
+        Synthesizes text to an in-memory bytestream in the specified audio format.
+        Yields audio data chunks as they are generated.
+
+        :param text: The text to synthesize.
+        :param format: The desired audio format (e.g., 'wav', 'mp3', 'flac'). Defaults to 'wav'.
+        :return: A generator yielding bytes objects containing audio data.
+        """
+        try:
+            logging.info(f"[WatsonTTS.synth_to_bytestream] Synthesizing text: {text}")
+
+            # Split the text into smaller segments (e.g., sentences) for incremental synthesis
+            text_segments = self._split_text(text)
+
+            for segment_idx, segment in enumerate(text_segments):
+                logging.info(f"Synthesizing segment {segment_idx}: {segment}")
+                audio_bytes = self.synth_to_bytes(str(segment))
+
+                if format.lower() == "wav":
+                    # Yield raw PCM data, skipping the WAV header.
+                    # Watson audio from synth_to_bytes is assumed to be
+                    # 16-bit PCM in a canonical 44-byte-header WAV container.
+                    audio_stream = BytesIO(audio_bytes)
+                    audio_stream.seek(44)  # Skip the 44-byte WAV header
+                    chunk_size = 1024  # Number of bytes per chunk
+
+                    while True:
+                        chunk = audio_stream.read(chunk_size)
+                        if not chunk:
+                            break
+                        yield chunk
+
+                elif format.lower() in ["mp3", "flac"]:
+                    # Convert PCM to the desired format using _convert_audio
+                    pcm_data = np.frombuffer(audio_bytes, dtype=np.int16)
+                    converted_audio = self._convert_audio(
+                        pcm_data, format, self.audio_rate
+                    )
+                    chunk_size = 4096  # Number of bytes per chunk
+                    audio_io = BytesIO(converted_audio)
+
+                    while True:
+                        chunk = audio_io.read(chunk_size)
+                        if not chunk:
+                            break
+                        yield chunk
+
+                else:
+                    raise UnsupportedFileFormat(f"Unsupported format: {format}")
+
+        except Exception as e:
+            logging.error(f"Error in synth_to_bytestream: {e}")
+            raise
+
+    def play_audio(self):
+        """
+        Plays audio from the audio_buffer using sounddevice.
+        """
+        try:
+            logging.info("Starting audio playback thread...")
+            with sd.OutputStream(
+                samplerate=self.audio_rate,
+                channels=1,
+                callback=self.play_audio_callback,
+                blocksize=4096,
+                dtype="float32",
+            ):
+                self.playback_finished.wait()
+        except Exception as e:
+            logging.error(f"Error during audio playback: {e}")
+            self.audio_killed = True
+
+    def _split_text(self, text: str) -> List[str]:
+        # Simple sentence splitter based on punctuation.
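+        # The lookbehind (?<=[.!?]) splits on the spaces *after* sentence
+        # punctuation, so the punctuation stays attached to its sentence.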
+        sentences = re.split(r"(?<=[.!?]) +", text)
+        return sentences
+
+    # Audio playback callback, called continuously to stream audio from the buffer
+    def play_audio_callback(
+        self, outdata: np.ndarray, frames: int, time_info, status: sd.CallbackFlags
+    ):
+        if self.audio_killed or (
+            self.audio_started and self.audio_buffer.empty() and self.audio_stopped
+        ):
+            logging.error("AUDIO KILLED OR STOPPED OR BUFFER EMPTY")
+            self.playback_finished.set()
+            return
+
+        if self.audio_buffer.empty():
+            outdata.fill(0)
+            return
+
+        n = 0
+        while n < frames and not self.audio_buffer.empty():
+            remaining = frames - n
+            current_chunk = self.audio_buffer.queue[0]
+            k = current_chunk.shape[0]
+
+            if remaining <= k:
+                outdata[n:, 0] = current_chunk[:remaining]
+                self.audio_buffer.queue[0] = current_chunk[remaining:]
+                n = frames
+                if self.audio_buffer.queue[0].shape[0] == 0:
+                    self.audio_buffer.get()
+                break
+
+            outdata[n : n + k, 0] = self.audio_buffer.get()
+            n += k
+
+        if n < frames:
+            outdata[n:, 0] = 0
\ No newline at end of file
diff --git a/tts_wrapper/engines/witai/witai.py b/tts_wrapper/engines/witai/witai.py
index a2dcfad1..0f95b4c0 100644
--- a/tts_wrapper/engines/witai/witai.py
+++ b/tts_wrapper/engines/witai/witai.py
@@ -1,12 +1,19 @@
 from ...tts import AbstractTTS, FileFormat
-from typing import Optional, List, Dict, Any
+from typing import Any, List, Optional, Dict, Literal, Tuple, Generator
 from . import WitAiClient, WitAiSSML
 from ...engines.utils import (
     estimate_word_timings,
 )  # Import the timing estimation function
 from ...exceptions import UnsupportedFileFormat
+import re
+import numpy as np
+import pathlib
 import logging
-
+import threading
+import queue
+import sounddevice as sd
+import time
+from io import BytesIO
 class WitAiTTS(AbstractTTS):
     def __init__(
@@ -20,6 +27,12 @@ def __init__(
         self._voice = voice
         self._lang = lang
         self.audio_rate = 24000  # Adjusted based on Wit.ai's 24kHz sample rate for PCM
+        self.audio_buffer = queue.Queue()
+        self.playback_finished = threading.Event()
+        self.audio_started = False
+        self.audio_stopped = False
+        self.audio_killed = False
+        self.audio_format = "wav"  # Default format
 
     def synth_to_bytes(self, text: str) -> bytes:
         if not self._is_ssml(str(text)):
@@ -50,3 +63,215 @@ def set_voice(self, voice_id: str, lang_id: str):
 
     def construct_prosody_tag(self, text: str) -> str:
         pass
+
+    def speak_streamed(
+        self,
+        text: str,
+        save_to_file_path: Optional[str] = None,
+        audio_format: Optional[str] = "wav",
+    ) -> None:
+        """
+        Synthesizes text and plays it back using sounddevice in a streaming fashion.
+        Optionally saves the audio to a file after playback completes.
+
+        :param text: The text to synthesize and play.
+        :param save_to_file_path: Path to save the audio file (optional).
+        :param audio_format: Audio format to save (e.g., 'wav', 'mp3', 'flac').
+        """
+        logging.info(
+            "[WitAiTTS.speak_streamed] Starting speech synthesis and playback..."
+        )
+
+        # Reset flags
+        self.audio_started = False
+        self.audio_stopped = False
+        self.playback_finished.clear()
+
+        # Open the output file if saving is required
+        output_file = None
+        if save_to_file_path:
+            output_file = open(save_to_file_path, "wb")
+            logging.info(
+                f"Saving audio to {save_to_file_path} in {audio_format} format."
+            )
+
+        try:
+            # Start audio playback in a separate thread
+            playback_thread = threading.Thread(target=self.play_audio)
+            playback_thread.start()
+
+            # Iterate over the generator returned by synth_to_bytestream
+            for chunk_idx, audio_chunk in enumerate(
+                self.synth_to_bytestream(str(text), format=audio_format)
+            ):
+                logging.info(
+                    f"Processing audio chunk {chunk_idx} with size {len(audio_chunk)} bytes"
+                )
+                if audio_format.lower() == "wav":
+                    # Convert bytes back to numpy float32 array for playback
+                    # Assuming audio_chunk is raw PCM data (LINEAR16)
+                    samples = (
+                        np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32)
+                        / 32767.0
+                    )
+                elif audio_format.lower() in ["mp3", "flac"]:
+                    # For formats like MP3 or FLAC, you need to decode them back to PCM
+                    # This requires additional processing which is not implemented here
+                    # For simplicity, we'll skip playback for non-WAV formats
+                    samples = None
+                    logging.warning(
+                        f"Playback for format '{audio_format}' is not implemented."
+                    )
+                else:
+                    raise UnsupportedFileFormat(f"Unsupported format: {audio_format}")
+
+                if samples is not None:
+                    # Add audio samples to the buffer for streaming playback
+                    self.audio_buffer.put(samples)
+                    logging.info(f"Audio chunk {chunk_idx} added to buffer")
+
+                    # Write the chunk to the file if saving
+                    if save_to_file_path:
+                        output_file.write(audio_chunk)
+
+                if not self.audio_started and samples is not None:
+                    logging.info("Starting audio playback...")
+                    self.audio_started = True
+
+            # Signal that audio generation is complete
+            self.audio_stopped = True
+
+            # Wait for playback to finish
+            playback_thread.join()
+            logging.info("Playback finished.")
+
+        except Exception as e:
+            logging.error(f"Error during speak_streamed: {e}")
+            self.audio_killed = True
+
+        finally:
+            if output_file:
+                output_file.close()
+                logging.info(
+                    f"Audio successfully saved to {save_to_file_path} in {audio_format} format."
+                )
+
+    def synth_to_bytestream(
+        self, text: Any, format: Optional[str] = "wav"
+    ) -> Generator[bytes, None, None]:
+        """
+        Synthesizes text to an in-memory bytestream in the specified audio format.
+        Yields audio data chunks as they are generated.
+
+        :param text: The text to synthesize.
+        :param format: The desired audio format (e.g., 'wav', 'mp3', 'flac'). Defaults to 'wav'.
+        :return: A generator yielding bytes objects containing audio data.
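+
+        Raw PCM is yielded in 1024-byte chunks; encoded formats ('mp3',
+        'flac') are converted first and yielded in 4096-byte chunks.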
+        """
+        try:
+            logging.info(f"[WitAiTTS.synth_to_bytestream] Synthesizing text: {text}")
+
+            # Split the text into smaller segments (e.g., sentences) for incremental synthesis
+            text_segments = self._split_text(text)
+
+            for segment_idx, segment in enumerate(text_segments):
+                logging.info(f"Synthesizing segment {segment_idx}: {segment}")
+                audio_bytes = self.synth_to_bytes(str(segment))
+
+                if format.lower() == "wav":
+                    # Yield raw PCM data, skipping the WAV header.
+                    # Wit.ai audio from synth_to_bytes is assumed to be 16-bit
+                    # PCM in a canonical 44-byte-header WAV container.
+                    audio_stream = BytesIO(audio_bytes)
+                    audio_stream.seek(44)  # Skip the 44-byte WAV header
+                    chunk_size = 1024  # Number of bytes per chunk
+
+                    while True:
+                        chunk = audio_stream.read(chunk_size)
+                        if not chunk:
+                            break
+                        yield chunk
+
+                elif format.lower() in ["mp3", "flac"]:
+                    # Convert PCM to the desired format using _convert_audio
+                    pcm_data = np.frombuffer(audio_bytes, dtype=np.int16)
+                    converted_audio = self._convert_audio(
+                        pcm_data, format, self.audio_rate
+                    )
+                    chunk_size = 4096  # Number of bytes per chunk
+                    audio_io = BytesIO(converted_audio)
+
+                    while True:
+                        chunk = audio_io.read(chunk_size)
+                        if not chunk:
+                            break
+                        yield chunk
+
+                else:
+                    raise UnsupportedFileFormat(f"Unsupported format: {format}")
+
+        except Exception as e:
+            logging.error(f"Error in synth_to_bytestream: {e}")
+            raise
+
+    def play_audio(self):
+        """
+        Plays audio from the audio_buffer using sounddevice.
+        """
+        try:
+            logging.info("Starting audio playback thread...")
+            with sd.OutputStream(
+                samplerate=self.audio_rate,
+                channels=1,
+                callback=self.play_audio_callback,
+                blocksize=4096,
+                dtype="float32",
+            ):
+                self.playback_finished.wait()
+        except Exception as e:
+            logging.error(f"Error during audio playback: {e}")
+            self.audio_killed = True
+
+    def _split_text(self, text: str) -> List[str]:
+        # Simple sentence splitter based on punctuation.
+        sentences = re.split(r"(?<=[.!?]) +", text)
+        return sentences
+
+    # Audio playback callback, called continuously to stream audio from the buffer
+    def play_audio_callback(
+        self, outdata: np.ndarray, frames: int, time_info, status: sd.CallbackFlags
+    ):
+        if self.audio_killed or (
+            self.audio_started and self.audio_buffer.empty() and self.audio_stopped
+        ):
+            logging.error("AUDIO KILLED OR STOPPED OR BUFFER EMPTY")
+            self.playback_finished.set()
+            return
+
+        if self.audio_buffer.empty():
+            outdata.fill(0)
+            return
+
+        n = 0
+        while n < frames and not self.audio_buffer.empty():
+            remaining = frames - n
+            current_chunk = self.audio_buffer.queue[0]
+            k = current_chunk.shape[0]
+
+            if remaining <= k:
+                outdata[n:, 0] = current_chunk[:remaining]
+                self.audio_buffer.queue[0] = current_chunk[remaining:]
+                n = frames
+                if self.audio_buffer.queue[0].shape[0] == 0:
+                    self.audio_buffer.get()
+                break
+
+            outdata[n : n + k, 0] = self.audio_buffer.get()
+            n += k
+
+        if n < frames:
+            outdata[n:, 0] = 0
\ No newline at end of file
diff --git a/tts_wrapper/tts.py b/tts_wrapper/tts.py
index f132356e..8ef81f93 100644
--- a/tts_wrapper/tts.py
+++ b/tts_wrapper/tts.py
@@ -295,6 +295,7 @@ def callback(self, outdata, frames, time, status):
         # Each frame is 2 bytes for int16,
         # so frames * 2 gives the number of bytes
         end_position = self.position + frames * 2
+
         data = self.audio_bytes[self.position : end_position]
         if len(data) < frames * 2:
             # Not enough data to fill outdata, zero-pad it