Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lmnt synthesizer #677

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 26 additions & 11 deletions quickstarts/streaming_conversation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,25 @@
from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent
from vocode.streaming.models.agent import ChatGPTAgentConfig
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig

# Import LMNT synthesizer
# from vocode.streaming.synthesizer.eleven_labs_websocket_synthesizer import ElevenLabsWSSynthesizer
# from vocode.streaming.models.synthesizer import ElevenLabsSynthesizerConfig
# from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
from vocode.streaming.models.synthesizer import (
AudioEncoding,
GoogleSynthesizerConfig,
LMNTSynthesizerConfig,
)
from vocode.streaming.models.transcriber import (
DeepgramTranscriberConfig,
PunctuationEndpointingConfig,
)
from vocode.streaming.streaming_conversation import StreamingConversation
from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
from vocode.streaming.synthesizer.lmnt_synthesizer import LMNTSynthesizer

# from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
# from vocode.streaming.synthesizer.google_synthesizer import GoogleSynthesizer
from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber

configure_pretty_logging()
Expand All @@ -26,11 +38,9 @@ class Settings(BaseSettings):
These parameters can be configured with environment variables.
"""

openai_api_key: str = "ENTER_YOUR_OPENAI_API_KEY_HERE"
azure_speech_key: str = "ENTER_YOUR_AZURE_KEY_HERE"
deepgram_api_key: str = "ENTER_YOUR_DEEPGRAM_API_KEY_HERE"

azure_speech_region: str = "eastus"
openai_api_key: str = "YOUR_OPENAI_API_KEY"
deepgram_api_key: str = "YOUR_DEEPGRAM_API_KEY"
lmnt_api_key: str = "YOUR_LMNT_API_KEY"

# This means a .env file can be used to overload these settings
# ex: "OPENAI_API_KEY=my_key" will set openai_api_key over the default above
Expand Down Expand Up @@ -68,10 +78,15 @@ async def main():
prompt_preamble="""The AI is having a pleasant conversation about life""",
)
),
synthesizer=AzureSynthesizer(
AzureSynthesizerConfig.from_output_device(speaker_output),
azure_speech_key=settings.azure_speech_key,
azure_speech_region=settings.azure_speech_region,
synthesizer=LMNTSynthesizer(
LMNTSynthesizerConfig(
api_key=settings.lmnt_api_key,
voice_id="lily",
stability=0.5,
similarity_boost=0.75,
sampling_rate=16000,
audio_encoding=AudioEncoding.LINEAR16,
)
),
)
await conversation.start()
Expand Down
19 changes: 19 additions & 0 deletions vocode/streaming/models/synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class SynthesizerType(str, Enum):
BARK = "synthesizer_bark"
POLLY = "synthesizer_polly"
CARTESIA = "synthesizer_cartesia"
LMNT = "synthesizer_lmnt"


class SentimentConfig(BaseModel):
Expand Down Expand Up @@ -245,3 +246,21 @@ class CartesiaSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.CARTESIA
model_id: str = DEFAULT_CARTESIA_MODEL_ID
voice_id: str = DEFAULT_CARTESIA_VOICE_ID
experimental_voice_controls: Optional[CartesiaVoiceControls] = None


class LMNTSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.LMNT.value):
    """Configuration for the LMNT synthesizer.

    NOTE(review): `stability` and `similarity_boost` mirror ElevenLabs-style
    voice controls; whether LMNT honors them is unconfirmed — verify against
    the LMNT API before relying on them.
    """

    api_key: Optional[str] = None  # LMNT API key; may also be supplied at synthesizer construction
    voice_id: Optional[str] = "lily"  # falls back to "lily" when explicitly set to None/empty
    stability: Optional[float] = None
    similarity_boost: Optional[float] = None

    @validator("voice_id")
    def set_default_voice_id(cls, voice_id):
        # An explicit None/empty value falls back to the default voice.
        if not voice_id:
            return "lily"
        return voice_id

    @validator("similarity_boost", always=True)
    def stability_and_similarity_boost_check(cls, similarity_boost, values):
        # The two voice-control knobs only make sense together: reject a
        # config where exactly one of them is provided.
        stability_set = values.get("stability") is not None
        boost_set = similarity_boost is not None
        if stability_set != boost_set:
            raise ValueError("Both stability and similarity_boost must be set or not set.")
        return similarity_boost
118 changes: 118 additions & 0 deletions vocode/streaming/synthesizer/lmnt_synthesizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import asyncio
import base64
import hashlib
from typing import Optional

import aiohttp
from loguru import logger

from vocode.streaming.models.audio import AudioEncoding, SamplingRate
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.models.synthesizer import LMNTSynthesizerConfig
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer, SynthesisResult
from vocode.streaming.utils.create_task import asyncio_create_task

LMNT_BASE_URL = "https://api.lmnt.com/v1"
STREAMED_CHUNK_SIZE = 16000 * 2 // 4 # 1/8 of a second of 16kHz audio with 16-bit samples


class LMNTSynthesizer(BaseSynthesizer[LMNTSynthesizerConfig]):
    """Synthesizer backed by the LMNT text-to-speech HTTP API.

    Sends synthesis requests to the LMNT ``/ai/speech`` endpoint and streams
    the decoded audio back to the pipeline in ``chunk_size``-byte pieces.
    """

    def __init__(
        self,
        synthesizer_config: LMNTSynthesizerConfig,
    ):
        super().__init__(synthesizer_config)

        assert synthesizer_config.api_key is not None, "API key must be set"
        assert synthesizer_config.voice_id is not None, "Voice ID must be set"
        self.api_key = synthesizer_config.api_key

        self.voice_id = synthesizer_config.voice_id
        # NOTE(review): stability/similarity_boost mirror ElevenLabs-style
        # voice controls; kept for config compatibility but not sent to the
        # API since LMNT support for them is unconfirmed.
        self.stability = synthesizer_config.stability
        self.similarity_boost = synthesizer_config.similarity_boost
        self.sample_rate = self.synthesizer_config.sampling_rate

        # Map vocode audio encodings onto LMNT output formats so the response
        # matches what the pipeline expects.
        # TODO(review): confirm the exact `format` values against LMNT API docs.
        if self.synthesizer_config.audio_encoding == AudioEncoding.LINEAR16:
            self.output_format = "raw"  # raw 16-bit PCM at self.sample_rate
        elif self.synthesizer_config.audio_encoding == AudioEncoding.MULAW:
            self.output_format = "ulaw"
        else:
            raise ValueError(
                f"Unsupported audio encoding: {self.synthesizer_config.audio_encoding}"
            )

        self.session = aiohttp.ClientSession()

    async def create_speech_uncached(
        self,
        message: BaseMessage,
        chunk_size: int,
        is_first_text_chunk: bool = False,
        is_sole_text_chunk: bool = False,
    ) -> SynthesisResult:
        """Kick off synthesis for ``message`` and return a streaming result.

        The HTTP request runs in a background task; audio chunks of at most
        ``chunk_size`` bytes are pushed onto a queue consumed by the returned
        SynthesisResult.
        """
        self.total_chars += len(message.text)
        url = f"{LMNT_BASE_URL}/ai/speech"
        headers = {"X-API-Key": self.api_key}
        body = {
            "text": message.text,
            "voice": self.voice_id,
            # Explicitly request the output format/rate the pipeline expects
            # (previously these were computed but never sent to the API).
            "format": self.output_format,
            "sample_rate": str(self.sample_rate),
        }

        # Do NOT log `headers` here: it contains the API key.
        logger.debug(f"Sending request to {url} with body {body}")

        assert body["text"], "Text must not be empty"
        assert body["voice"], "Voice ID must not be empty"

        chunk_queue: asyncio.Queue[Optional[bytes]] = asyncio.Queue()
        asyncio_create_task(
            self.get_chunks(url, headers, body, chunk_size, chunk_queue),
        )

        return SynthesisResult(
            self.chunk_result_generator_from_queue(chunk_queue),
            # 150 wpm is a rough speaking-speed estimate for message cutoff.
            lambda seconds: self.get_message_cutoff_from_voice_speed(message, seconds, 150),
        )

    @classmethod
    def get_voice_identifier(cls, synthesizer_config: LMNTSynthesizerConfig):
        """Return a cache key identifying this voice configuration.

        The API key is hashed so the identifier can be logged/stored safely.
        """
        hashed_api_key = hashlib.sha256(f"{synthesizer_config.api_key}".encode("utf-8")).hexdigest()
        return ":".join(
            (
                "lmnt",
                hashed_api_key,
                str(synthesizer_config.voice_id),
                str(synthesizer_config.stability),
                str(synthesizer_config.similarity_boost),
                synthesizer_config.audio_encoding,
            )
        )

    async def get_chunks(
        self,
        url: str,
        headers: dict,
        body: dict,
        chunk_size: int,
        chunk_queue: asyncio.Queue[Optional[bytes]],
    ):
        """POST the synthesis request and enqueue decoded audio chunks.

        Always enqueues a trailing ``None`` sentinel so the consumer
        terminates, even on error or cancellation.
        """
        try:
            async with self.session.post(url, headers=headers, data=body) as resp:
                if resp.status != 200:
                    logger.error(f"LMNT API failed: {resp.status} {await resp.text()}")
                    raise Exception(f"LMNT API returned {resp.status} status code")

                data = await resp.json()
                audio = base64.b64decode(data["audio"])
                # Feed the pipeline in chunk_size pieces rather than one
                # giant buffer (previously chunk_size was ignored).
                for start in range(0, len(audio), chunk_size):
                    chunk_queue.put_nowait(audio[start : start + chunk_size])

        except asyncio.CancelledError:
            pass
        finally:
            chunk_queue.put_nowait(None)  # treated as sentinel

    async def close(self):
        """Release the underlying aiohttp session."""
        await self.session.close()