From 5754a1b113acbf84caf005c71dfaa8c2d7fd0225 Mon Sep 17 00:00:00 2001 From: David vonThenen <12752197+dvonthenen@users.noreply.github.com> Date: Thu, 17 Oct 2024 17:00:31 -0700 Subject: [PATCH] Port Enhancements from EA Release --- deepgram/audio/microphone/microphone.py | 138 ++++++++++-------- deepgram/audio/speaker/speaker.py | 111 +++++++------- .../common/v1/abstract_async_websocket.py | 5 +- .../common/v1/abstract_sync_websocket.py | 5 +- .../speak/v1/websocket/async_client.py | 18 ++- deepgram/clients/speak/v1/websocket/client.py | 18 ++- .../{async_complete => async_simple}/main.py | 0 .../{complete => output_to_wav}/main.py | 72 +++------ .../text-to-speech/websocket/simple/main.py | 72 ++++++--- 9 files changed, 252 insertions(+), 187 deletions(-) rename examples/text-to-speech/websocket/{async_complete => async_simple}/main.py (100%) rename examples/text-to-speech/websocket/{complete => output_to_wav}/main.py (53%) diff --git a/deepgram/audio/microphone/microphone.py b/deepgram/audio/microphone/microphone.py index ae452984..91ba0df0 100644 --- a/deepgram/audio/microphone/microphone.py +++ b/deepgram/audio/microphone/microphone.py @@ -22,8 +22,8 @@ class Microphone: # pylint: disable=too-many-instance-attributes _logger: verboselogs.VerboseLogger - _audio: "pyaudio.PyAudio" - _stream: "pyaudio.Stream" + _audio: Optional["pyaudio.PyAudio"] = None + _stream: Optional["pyaudio.Stream"] = None _chunk: int _rate: int @@ -145,59 +145,31 @@ def start(self) -> bool: self._asyncio_thread = None self._push_callback = self._push_callback_org - self._stream = self._audio.open( - format=self._format, - channels=self._channels, - rate=self._rate, - input=True, - frames_per_buffer=self._chunk, - input_device_index=self._input_device_index, - stream_callback=self._callback, - ) + if self._audio is not None: + self._stream = self._audio.open( + format=self._format, + channels=self._channels, + rate=self._rate, + input=True, + output=False, + frames_per_buffer=self._chunk, + input_device_index=self._input_device_index, + stream_callback=self._callback, + ) + + if self._stream is None: + self._logger.error("start failed. No stream created.") + self._logger.debug("Microphone.start LEAVE") + return False self._exit.clear() - self._stream.start_stream() + if self._stream is not None: + self._stream.start_stream() self._logger.notice("start succeeded") self._logger.debug("Microphone.start LEAVE") return True - def _callback( - self, input_data, frame_count, time_info, status_flags - ): # pylint: disable=unused-argument - """ - The callback used to process data in callback mode. - """ - # dynamic import of pyaudio as not to force the requirements on the SDK (and users) - import pyaudio # pylint: disable=import-outside-toplevel - - self._logger.debug("Microphone._callback ENTER") - - if self._exit.is_set(): - self._logger.info("exit is Set") - self._logger.notice("_callback stopping...") - self._logger.debug("Microphone._callback LEAVE") - return None, pyaudio.paAbort - - if input_data is None: - self._logger.warning("input_data is None") - self._logger.debug("Microphone._callback LEAVE") - return None, pyaudio.paContinue - - try: - if self._is_muted: - size = len(input_data) - input_data = b"\x00" * size - - self._push_callback(input_data) - except Exception as e: - self._logger.error("Error while sending: %s", str(e)) - self._logger.debug("Microphone._callback LEAVE") - raise - - self._logger.debug("Microphone._callback LEAVE") - return input_data, pyaudio.paContinue - def mute(self) -> bool: """ mute - mutes the microphone stream @@ -205,17 +177,17 @@ def mute(self) -> bool: Returns: bool: True if the stream was muted, False otherwise """ - self._logger.debug("Microphone.mute ENTER") + self._logger.verbose("Microphone.mute ENTER") if self._stream is None: self._logger.error("mute failed. Library not initialized.") - self._logger.debug("Microphone.mute LEAVE") + self._logger.verbose("Microphone.mute LEAVE") return False self._is_muted = True self._logger.notice("mute succeeded") - self._logger.debug("Microphone.mute LEAVE") + self._logger.verbose("Microphone.mute LEAVE") return True def unmute(self) -> bool: @@ -225,19 +197,42 @@ def unmute(self) -> bool: Returns: bool: True if the stream was unmuted, False otherwise """ - self._logger.debug("Microphone.unmute ENTER") + self._logger.verbose("Microphone.unmute ENTER") if self._stream is None: self._logger.error("unmute failed. Library not initialized.") - self._logger.debug("Microphone.unmute LEAVE") + self._logger.verbose("Microphone.unmute LEAVE") return False self._is_muted = False self._logger.notice("unmute succeeded") - self._logger.debug("Microphone.unmute LEAVE") + self._logger.verbose("Microphone.unmute LEAVE") return True + def is_muted(self) -> bool: + """ + is_muted - returns the state of the stream + + Args: + None + + Returns: + True if the stream is muted, False otherwise + """ + self._logger.spam("Microphone.is_muted ENTER") + + if self._stream is None: + self._logger.spam("is_muted: stream is None") + self._logger.spam("Microphone.is_muted LEAVE") + return False + + val = self._is_muted + + self._logger.spam("is_muted: %s", val) + self._logger.spam("Microphone.is_muted LEAVE") + return val + def finish(self) -> bool: """ finish - stops the microphone stream @@ -255,7 +250,6 @@ def finish(self) -> bool: self._logger.notice("stopping stream...") self._stream.stop_stream() self._stream.close() - self._stream = None # type: ignore self._logger.notice("stream stopped") # clean up the thread @@ -265,13 +259,43 @@ def finish(self) -> bool: self._asyncio_thread is not None ): - self._logger.notice("stopping asyncio loop...") + self._logger.notice("stopping _asyncio_loop...") self._asyncio_loop.call_soon_threadsafe(self._asyncio_loop.stop) self._asyncio_thread.join() - self._asyncio_thread = None self._logger.notice("_asyncio_thread joined") + self._stream = None + self._asyncio_thread = None self._logger.notice("finish succeeded") self._logger.debug("Microphone.finish LEAVE") return True + + def _callback( + self, input_data, frame_count, time_info, status_flags + ): # pylint: disable=unused-argument + """ + The callback used to process data in callback mode. + """ + # dynamic import of pyaudio as not to force the requirements on the SDK (and users) + import pyaudio # pylint: disable=import-outside-toplevel + + if self._exit.is_set(): + self._logger.notice("_callback exit is Set. stopping...") + return None, pyaudio.paAbort + + if input_data is None: + self._logger.warning("input_data is None") + return None, pyaudio.paContinue + + try: + if self._is_muted: + size = len(input_data) + input_data = b"\x00" * size + + self._push_callback(input_data) + except Exception as e: + self._logger.error("Error while sending: %s", str(e)) + raise + + return input_data, pyaudio.paContinue diff --git a/deepgram/audio/speaker/speaker.py b/deepgram/audio/speaker/speaker.py index faee51c4..66bc2dba 100644 --- a/deepgram/audio/speaker/speaker.py +++ b/deepgram/audio/speaker/speaker.py @@ -15,6 +15,8 @@ from ...utils import verboselogs from .constants import LOGGING, CHANNELS, RATE, CHUNK, TIMEOUT, PLAYBACK_DELTA +from ..microphone import Microphone + if TYPE_CHECKING: import pyaudio @@ -28,7 +30,7 @@ class Speaker: # pylint: disable=too-many-instance-attributes _logger: verboselogs.VerboseLogger - _audio: "pyaudio.PyAudio" + _audio: Optional["pyaudio.PyAudio"] = None _stream: Optional["pyaudio.Stream"] = None _chunk: int @@ -56,6 +58,8 @@ class Speaker: # pylint: disable=too-many-instance-attributes _pull_callback_org: Optional[Callable] = None _pull_callback: Optional[Callable] = None + _microphone: Optional[Microphone] = None + def __init__( self, pull_callback: Optional[Callable] = None, @@ -66,6 +70,7 @@ def __init__( channels: int = CHANNELS, last_play_delta_in_ms: int = PLAYBACK_DELTA, output_device_index: Optional[int] = None, + microphone: Optional[Microphone] = None, ): # pylint: disable=too-many-positional-arguments # dynamic import of pyaudio as not to force the requirements on the SDK (and users) import pyaudio # pylint: disable=import-outside-toplevel @@ -80,6 +85,8 @@ def __init__( self._last_datagram = datetime.now() self._lock_wait = threading.Lock() + self._microphone = microphone + self._audio = pyaudio.PyAudio() self._chunk = chunk self._rate = rate @@ -117,10 +124,6 @@ def set_pull_callback(self, pull_callback: Callable) -> None: """ self._pull_callback_org = pull_callback - # def _start_asyncio_loop(self) -> None: - # self._asyncio_loop = asyncio.new_event_loop() - # self._asyncio_loop.run_forever() - def start(self, active_loop: Optional[asyncio.AbstractEventLoop] = None) -> bool: """ starts - starts the Speaker stream @@ -147,45 +150,25 @@ def start(self, active_loop: Optional[asyncio.AbstractEventLoop] = None) -> bool self._exit.clear() self._queue = queue.Queue() - self._stream = self._audio.open( - format=self._format, - channels=self._channels, - rate=self._rate, - input=False, - output=True, - frames_per_buffer=self._chunk, - output_device_index=self._output_device_index, - ) + if self._audio is not None: + self._stream = self._audio.open( + format=self._format, + channels=self._channels, + rate=self._rate, + input=False, + output=True, + frames_per_buffer=self._chunk, + output_device_index=self._output_device_index, + ) + + if self._stream is None: + self._logger.error("start failed. No stream created.") + self._logger.debug("Speaker.start LEAVE") + return False self._push_callback = self._push_callback_org self._pull_callback = self._pull_callback_org - # if inspect.iscoroutinefunction( - # self._push_callback_org - # ) or inspect.iscoroutinefunction(self._pull_callback_org): - # self._logger.verbose("Starting asyncio loop...") - # self._asyncio_thread = threading.Thread(target=self._start_asyncio_loop) - # self._asyncio_thread.start() - - # # determine if the push_callback is a coroutine - # if inspect.iscoroutinefunction(self._push_callback_org): - # self._logger.verbose("async/await push callback") - # self._push_callback = lambda data: asyncio.run_coroutine_threadsafe( - # self._push_callback_org(data), self._asyncio_loop - # ).result() - # else: - # self._logger.verbose("threaded push callback") - # self._push_callback = self._push_callback_org - - # if inspect.iscoroutinefunction(self._pull_callback_org): - # self._logger.verbose("async/await pull callback") - # self._pull_callback = lambda: asyncio.run_coroutine_threadsafe( - # self._pull_callback_org(), self._asyncio_loop - # ).result() - # else: - # self._logger.verbose("threaded pull callback") - # self._pull_callback = self._pull_callback_org - # start the play thread self._thread = threading.Thread( target=self._play, args=(self._queue, self._stream, self._exit), daemon=True @@ -193,7 +176,8 @@ def start(self, active_loop: Optional[asyncio.AbstractEventLoop] = None) -> bool self._thread.start() # Start the stream - self._stream.start_stream() + if self._stream is not None: + self._stream.start_stream() # Start the receiver thread within the start function self._logger.verbose("Starting receiver thread...") @@ -205,6 +189,18 @@ def start(self, active_loop: Optional[asyncio.AbstractEventLoop] = None) -> bool return True + def wait_for_complete_with_mute(self, mic: Microphone): + """ + This method will mute/unmute a Microphone and block until the speak is done playing sound. + """ + self._logger.debug("Speaker.wait_for_complete ENTER") + + mic.mute() + self.wait_for_complete() + mic.unmute() + + self._logger.debug("Speaker.wait_for_complete LEAVE") + def wait_for_complete(self): """ This method will block until the speak is done playing sound. @@ -328,29 +324,24 @@ def finish(self) -> bool: self._logger.notice("stopping stream...") self._stream.stop_stream() self._stream.close() - self._stream = None self._logger.notice("stream stopped") if self._thread is not None: - self._logger.notice("joining thread...") + self._logger.notice("joining _thread...") self._thread.join() - self._thread = None self._logger.notice("thread stopped") - # if self._asyncio_thread is not None: - # self._logger.notice("stopping asyncio loop...") - # self._asyncio_loop.call_soon_threadsafe(self._asyncio_loop.stop) - # self._asyncio_thread.join() - # self._asyncio_thread = None - # self._logger.notice("_asyncio_thread joined") - if self._receiver_thread is not None: - self._logger.notice("stopping asyncio loop...") + self._logger.notice("stopping _receiver_thread...") self._receiver_thread.join() - self._receiver_thread = None self._logger.notice("_receiver_thread joined") - self._queue = None # type: ignore + with self._queue.mutex: + self._queue.queue.clear() + + self._stream = None + self._thread = None + self._receiver_thread = None self._logger.notice("finish succeeded") self._logger.debug("Speaker.finish LEAVE") @@ -363,9 +354,21 @@ def _play(self, audio_out, stream, stop): """ while not stop.is_set(): try: + if self._microphone is not None and self._microphone.is_muted(): + with self._lock_wait: + delta = datetime.now() - self._last_datagram + diff_in_ms = delta.total_seconds() * 1000 + if diff_in_ms > float(self._last_play_delta_in_ms): + self._logger.debug( + "LastPlay delta is greater than threshold. Unmute!" + ) + self._microphone.unmute() data = audio_out.get(True, TIMEOUT) with self._lock_wait: self._last_datagram = datetime.now() + if self._microphone is not None and not self._microphone.is_muted(): + self._logger.debug("New speaker sound detected. Mute!") + self._microphone.mute() stream.write(data) except queue.Empty: pass diff --git a/deepgram/clients/common/v1/abstract_async_websocket.py b/deepgram/clients/common/v1/abstract_async_websocket.py index 8a35a98d..c24253da 100644 --- a/deepgram/clients/common/v1/abstract_async_websocket.py +++ b/deepgram/clients/common/v1/abstract_async_websocket.py @@ -448,7 +448,10 @@ async def finish(self) -> bool: # debug the threads for thread in threading.enumerate(): - self._logger.debug("after running thread: %s", thread.name) + if thread is not None and thread.name is not None: + self._logger.debug("after running thread: %s", thread.name) + else: + self._logger.debug("after running thread: unknown_thread_name") self._logger.debug("number of active threads: %s", threading.active_count()) self._logger.notice("finish succeeded") diff --git a/deepgram/clients/common/v1/abstract_sync_websocket.py b/deepgram/clients/common/v1/abstract_sync_websocket.py index b29e9e1d..30d19504 100644 --- a/deepgram/clients/common/v1/abstract_sync_websocket.py +++ b/deepgram/clients/common/v1/abstract_sync_websocket.py @@ -445,7 +445,10 @@ def finish(self) -> bool: # debug the threads for thread in threading.enumerate(): - self._logger.debug("before running thread: %s", thread.name) + if thread is not None and thread.name is not None: + self._logger.debug("before running thread: %s", thread.name) + else: + self._logger.debug("after running thread: unknown_thread_name") self._logger.debug("number of active threads: %s", threading.active_count()) self._logger.notice("finish succeeded") diff --git a/deepgram/clients/speak/v1/websocket/async_client.py b/deepgram/clients/speak/v1/websocket/async_client.py index c5c1cb16..b7965c69 100644 --- a/deepgram/clients/speak/v1/websocket/async_client.py +++ b/deepgram/clients/speak/v1/websocket/async_client.py @@ -27,6 +27,7 @@ ) from .options import SpeakWSOptions +from .....audio.microphone import Microphone from .....audio.speaker import Speaker, RATE, CHANNELS, PLAYBACK_DELTA ONE_SECOND = 1 @@ -62,9 +63,13 @@ class AsyncSpeakWSClient( _options: Optional[Dict] = None _headers: Optional[Dict] = None + _speaker_created: bool = False _speaker: Optional[Speaker] = None + _microphone: Optional[Microphone] = None - def __init__(self, config: DeepgramClientOptions): + def __init__( + self, config: DeepgramClientOptions, microphone: Optional[Microphone] = None + ): if config is None: raise DeepgramError("Config is required") self._logger = verboselogs.VerboseLogger(__name__) @@ -80,6 +85,9 @@ def __init__(self, config: DeepgramClientOptions): self._last_datagram = None self._flush_count = 0 + # microphone + self._microphone = microphone + # init handlers self._event_handlers = { event: [] for event in SpeakWebSocketEvents.__members__.values() @@ -104,6 +112,8 @@ def __init__(self, config: DeepgramClientOptions): self._logger.debug("channels: %s", channels) self._logger.debug("device_index: %s", device_index) + self._speaker_created = True + if device_index is not None: self._speaker = Speaker( rate=rate, @@ -111,6 +121,7 @@ def __init__(self, config: DeepgramClientOptions): last_play_delta_in_ms=playback_delta_in_ms, verbose=self._config.verbose, output_device_index=device_index, + microphone=self._microphone, ) else: self._speaker = Speaker( @@ -118,6 +129,7 @@ def __init__(self, config: DeepgramClientOptions): channels=channels, last_play_delta_in_ms=playback_delta_in_ms, verbose=self._config.verbose, + microphone=self._microphone, ) # call the parent constructor @@ -628,6 +640,10 @@ async def finish(self) -> bool: if await super().finish() is False: self._logger.error("AsyncListenWebSocketClient.finish failed") + if self._speaker is not None and self._speaker_created: + self._speaker.finish() + self._speaker_created = False + # Before cancelling, check if the tasks were created # debug the threads for thread in threading.enumerate(): diff --git a/deepgram/clients/speak/v1/websocket/client.py b/deepgram/clients/speak/v1/websocket/client.py index f334bb9b..05580bfa 100644 --- a/deepgram/clients/speak/v1/websocket/client.py +++ b/deepgram/clients/speak/v1/websocket/client.py @@ -27,6 +27,7 @@ ) from .options import SpeakWSOptions +from .....audio.microphone import Microphone from .....audio.speaker import Speaker, RATE, CHANNELS, PLAYBACK_DELTA ONE_SECOND = 1 @@ -63,9 +64,13 @@ class SpeakWSClient( _options: Optional[Dict] = None _headers: Optional[Dict] = None + _speaker_created: bool = False _speaker: Optional[Speaker] = None + _microphone: Optional[Microphone] = None - def __init__(self, config: DeepgramClientOptions): + def __init__( + self, config: DeepgramClientOptions, microphone: Optional[Microphone] = None + ): if config is None: raise DeepgramError("Config is required") @@ -83,6 +88,9 @@ def __init__(self, config: DeepgramClientOptions): self._last_datagram = None self._flush_count = 0 + # microphone + self._microphone = microphone + # init handlers self._event_handlers = { event: [] for event in SpeakWebSocketEvents.__members__.values() @@ -107,6 +115,8 @@ def __init__(self, config: DeepgramClientOptions): self._logger.debug("channels: %s", channels) self._logger.debug("device_index: %s", device_index) + self._speaker_created = True + if device_index is not None: self._speaker = Speaker( rate=rate, @@ -114,6 +124,7 @@ def __init__(self, config: DeepgramClientOptions): last_play_delta_in_ms=playback_delta_in_ms, verbose=self._config.verbose, output_device_index=device_index, + microphone=self._microphone, ) else: self._speaker = Speaker( @@ -121,6 +132,7 @@ def __init__(self, config: DeepgramClientOptions): channels=channels, last_play_delta_in_ms=playback_delta_in_ms, verbose=self._config.verbose, + microphone=self._microphone, ) # call the parent constructor @@ -624,6 +636,10 @@ def finish(self) -> bool: if super().finish() is False: self._logger.error("ListenWebSocketClient.finish failed") + if self._speaker is not None and self._speaker_created: + self._speaker.finish() + self._speaker_created = False + # debug the threads for thread in threading.enumerate(): self._logger.debug("before running thread: %s", thread.name) diff --git a/examples/text-to-speech/websocket/async_complete/main.py b/examples/text-to-speech/websocket/async_simple/main.py similarity index 100% rename from examples/text-to-speech/websocket/async_complete/main.py rename to examples/text-to-speech/websocket/async_simple/main.py diff --git a/examples/text-to-speech/websocket/complete/main.py b/examples/text-to-speech/websocket/output_to_wav/main.py similarity index 53% rename from examples/text-to-speech/websocket/complete/main.py rename to examples/text-to-speech/websocket/output_to_wav/main.py index 612ae7da..b5a5629b 100644 --- a/examples/text-to-speech/websocket/complete/main.py +++ b/examples/text-to-speech/websocket/output_to_wav/main.py @@ -4,7 +4,7 @@ import time from deepgram.utils import verboselogs - +import wave from deepgram import ( DeepgramClient, @@ -13,23 +13,20 @@ SpeakWSOptions, ) -TTS_TEXT = "Hello, this is a text to speech example using Deepgram." - -global warning_notice -warning_notice = True +AUDIO_FILE = "output.wav" +TTS_TEXT = "Hello, this is a text to speech example using Deepgram. How are you doing today? I am fine thanks for asking." def main(): try: # example of setting up a client config. logging values: WARNING, VERBOSE, DEBUG, SPAM - config: DeepgramClientOptions = DeepgramClientOptions( - options={ - # "auto_flush_speak_delta": "500", - "speaker_playback": "true", - }, - # verbose=verboselogs.DEBUG, - ) - deepgram: DeepgramClient = DeepgramClient("", config) + # config: DeepgramClientOptions = DeepgramClientOptions( + # # options={"auto_flush_speak_delta": "500", "speaker_playback": "true"}, + # verbose=verboselogs.SPAM, + # ) + # deepgram: DeepgramClient = DeepgramClient("", config) + # otherwise, use default config + deepgram: DeepgramClient = DeepgramClient() # Create a websocket connection to Deepgram dg_connection = deepgram.speak.websocket.v("1") @@ -38,46 +35,25 @@ def on_open(self, open, **kwargs): print(f"\n\n{open}\n\n") def on_binary_data(self, data, **kwargs): - global warning_notice - if warning_notice: - print("Received binary data") - print("You can do something with the binary data here") - print("OR") - print( - "If you want to simply play the audio, set speaker_playback to true in the options for DeepgramClientOptions" - ) - warning_notice = False - - def on_metadata(self, metadata, **kwargs): - print(f"\n\n{metadata}\n\n") - - def on_flush(self, flushed, **kwargs): - print(f"\n\n{flushed}\n\n") - - def on_clear(self, clear, **kwargs): - print(f"\n\n{clear}\n\n") + print("Received binary data") + with open(AUDIO_FILE, "ab") as f: + f.write(data) + f.flush() def on_close(self, close, **kwargs): print(f"\n\n{close}\n\n") - def on_warning(self, warning, **kwargs): - print(f"\n\n{warning}\n\n") - - def on_error(self, error, **kwargs): - print(f"\n\n{error}\n\n") - - def on_unhandled(self, unhandled, **kwargs): - print(f"\n\n{unhandled}\n\n") - dg_connection.on(SpeakWebSocketEvents.Open, on_open) dg_connection.on(SpeakWebSocketEvents.AudioData, on_binary_data) - dg_connection.on(SpeakWebSocketEvents.Metadata, on_metadata) - dg_connection.on(SpeakWebSocketEvents.Flushed, on_flush) - dg_connection.on(SpeakWebSocketEvents.Cleared, on_clear) dg_connection.on(SpeakWebSocketEvents.Close, on_close) - dg_connection.on(SpeakWebSocketEvents.Error, on_error) - dg_connection.on(SpeakWebSocketEvents.Warning, on_warning) - dg_connection.on(SpeakWebSocketEvents.Unhandled, on_unhandled) + + # Generate a generic WAV container header + # since we don't support containerized audio, we need to generate a header + header = wave.open(AUDIO_FILE, "wb") + header.setnchannels(1) # Mono audio + header.setsampwidth(2) # 16-bit audio + header.setframerate(16000) # Sample rate of 16000 Hz + header.close() # connect to websocket options = SpeakWSOptions( @@ -93,13 +69,11 @@ def on_unhandled(self, unhandled, **kwargs): # send the text to Deepgram dg_connection.send_text(TTS_TEXT) - # if auto_flush_speak_delta is not used, you must flush the connection by calling flush() dg_connection.flush() # Indicate that we've finished - dg_connection.wait_for_complete() - + time.sleep(7) print("\n\nPress Enter to stop...\n\n") input() diff --git a/examples/text-to-speech/websocket/simple/main.py b/examples/text-to-speech/websocket/simple/main.py index b5a5629b..612ae7da 100644 --- a/examples/text-to-speech/websocket/simple/main.py +++ b/examples/text-to-speech/websocket/simple/main.py @@ -4,7 +4,7 @@ import time from deepgram.utils import verboselogs -import wave + from deepgram import ( DeepgramClient, @@ -13,20 +13,23 @@ SpeakWSOptions, ) -AUDIO_FILE = "output.wav" -TTS_TEXT = "Hello, this is a text to speech example using Deepgram. How are you doing today? I am fine thanks for asking." +TTS_TEXT = "Hello, this is a text to speech example using Deepgram." + +global warning_notice +warning_notice = True def main(): try: # example of setting up a client config. logging values: WARNING, VERBOSE, DEBUG, SPAM - # config: DeepgramClientOptions = DeepgramClientOptions( - # # options={"auto_flush_speak_delta": "500", "speaker_playback": "true"}, - # verbose=verboselogs.SPAM, - # ) - # deepgram: DeepgramClient = DeepgramClient("", config) - # otherwise, use default config - deepgram: DeepgramClient = DeepgramClient() + config: DeepgramClientOptions = DeepgramClientOptions( + options={ + # "auto_flush_speak_delta": "500", + "speaker_playback": "true", + }, + # verbose=verboselogs.DEBUG, + ) + deepgram: DeepgramClient = DeepgramClient("", config) # Create a websocket connection to Deepgram dg_connection = deepgram.speak.websocket.v("1") @@ -35,25 +38,46 @@ def on_open(self, open, **kwargs): print(f"\n\n{open}\n\n") def on_binary_data(self, data, **kwargs): - print("Received binary data") - with open(AUDIO_FILE, "ab") as f: - f.write(data) - f.flush() + global warning_notice + if warning_notice: + print("Received binary data") + print("You can do something with the binary data here") + print("OR") + print( + "If you want to simply play the audio, set speaker_playback to true in the options for DeepgramClientOptions" + ) + warning_notice = False + + def on_metadata(self, metadata, **kwargs): + print(f"\n\n{metadata}\n\n") + + def on_flush(self, flushed, **kwargs): + print(f"\n\n{flushed}\n\n") + + def on_clear(self, clear, **kwargs): + print(f"\n\n{clear}\n\n") def on_close(self, close, **kwargs): print(f"\n\n{close}\n\n") + def on_warning(self, warning, **kwargs): + print(f"\n\n{warning}\n\n") + + def on_error(self, error, **kwargs): + print(f"\n\n{error}\n\n") + + def on_unhandled(self, unhandled, **kwargs): + print(f"\n\n{unhandled}\n\n") + dg_connection.on(SpeakWebSocketEvents.Open, on_open) dg_connection.on(SpeakWebSocketEvents.AudioData, on_binary_data) + dg_connection.on(SpeakWebSocketEvents.Metadata, on_metadata) + dg_connection.on(SpeakWebSocketEvents.Flushed, on_flush) + dg_connection.on(SpeakWebSocketEvents.Cleared, on_clear) dg_connection.on(SpeakWebSocketEvents.Close, on_close) - - # Generate a generic WAV container header - # since we don't support containerized audio, we need to generate a header - header = wave.open(AUDIO_FILE, "wb") - header.setnchannels(1) # Mono audio - header.setsampwidth(2) # 16-bit audio - header.setframerate(16000) # Sample rate of 16000 Hz - header.close() + dg_connection.on(SpeakWebSocketEvents.Error, on_error) + dg_connection.on(SpeakWebSocketEvents.Warning, on_warning) + dg_connection.on(SpeakWebSocketEvents.Unhandled, on_unhandled) # connect to websocket options = SpeakWSOptions( @@ -69,11 +93,13 @@ def on_close(self, close, **kwargs): # send the text to Deepgram dg_connection.send_text(TTS_TEXT) + # if auto_flush_speak_delta is not used, you must flush the connection by calling flush() dg_connection.flush() # Indicate that we've finished - time.sleep(7) + dg_connection.wait_for_complete() + print("\n\nPress Enter to stop...\n\n") input()