Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lmnt synthesizer #677

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 26 additions & 11 deletions quickstarts/streaming_conversation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,25 @@
from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent
from vocode.streaming.models.agent import ChatGPTAgentConfig
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig

# Import LMNT synthesizer
# from vocode.streaming.synthesizer.eleven_labs_websocket_synthesizer import ElevenLabsWSSynthesizer
# from vocode.streaming.models.synthesizer import ElevenLabsSynthesizerConfig
# from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
from vocode.streaming.models.synthesizer import (
AudioEncoding,
GoogleSynthesizerConfig,
LMNTSynthesizerConfig,
)
from vocode.streaming.models.transcriber import (
DeepgramTranscriberConfig,
PunctuationEndpointingConfig,
)
from vocode.streaming.streaming_conversation import StreamingConversation
from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
from vocode.streaming.synthesizer.lmnt_synthesizer import LMNTSynthesizer

# from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
# from vocode.streaming.synthesizer.google_synthesizer import GoogleSynthesizer
from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber

configure_pretty_logging()
Expand All @@ -26,11 +38,9 @@ class Settings(BaseSettings):
These parameters can be configured with environment variables.
"""

openai_api_key: str = "ENTER_YOUR_OPENAI_API_KEY_HERE"
azure_speech_key: str = "ENTER_YOUR_AZURE_KEY_HERE"
deepgram_api_key: str = "ENTER_YOUR_DEEPGRAM_API_KEY_HERE"

azure_speech_region: str = "eastus"
openai_api_key: str = "YOUR_OPENAI_API_KEY"
deepgram_api_key: str = "YOUR_DEEPGRAM_API_KEY"
lmnt_api_key: str = "YOUR_LMNT_API_KEY"

# This means a .env file can be used to overload these settings
# ex: "OPENAI_API_KEY=my_key" will set openai_api_key over the default above
Expand Down Expand Up @@ -68,10 +78,15 @@ async def main():
prompt_preamble="""The AI is having a pleasant conversation about life""",
)
),
synthesizer=AzureSynthesizer(
AzureSynthesizerConfig.from_output_device(speaker_output),
azure_speech_key=settings.azure_speech_key,
azure_speech_region=settings.azure_speech_region,
synthesizer=LMNTSynthesizer(
LMNTSynthesizerConfig(
api_key=settings.lmnt_api_key,
voice_id="lily",
stability=0.5,
similarity_boost=0.75,
sampling_rate=16000,
audio_encoding=AudioEncoding.LINEAR16,
)
),
)
await conversation.start()
Expand Down
19 changes: 19 additions & 0 deletions vocode/streaming/models/synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class SynthesizerType(str, Enum):
BARK = "synthesizer_bark"
POLLY = "synthesizer_polly"
CARTESIA = "synthesizer_cartesia"
LMNT = "synthesizer_lmnt"


class SentimentConfig(BaseModel):
Expand Down Expand Up @@ -245,3 +246,21 @@ class CartesiaSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.CARTESIA
model_id: str = DEFAULT_CARTESIA_MODEL_ID
voice_id: str = DEFAULT_CARTESIA_VOICE_ID
experimental_voice_controls: Optional[CartesiaVoiceControls] = None


class LMNTSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.LMNT.value):
    """Configuration for the LMNT synthesizer.

    NOTE(review): `stability` and `similarity_boost` mirror ElevenLabs-style
    voice controls; whether LMNT honors them is unconfirmed — verify against
    the LMNT API before relying on them.
    """

    api_key: Optional[str] = None  # LMNT API key; may also be supplied at synthesizer construction
    voice_id: Optional[str] = "lily"  # falls back to "lily" when explicitly set to None/empty
    stability: Optional[float] = None
    similarity_boost: Optional[float] = None

    @validator("voice_id")
    def set_default_voice_id(cls, voice_id):
        # An explicit None/empty value falls back to the default voice.
        if not voice_id:
            return "lily"
        return voice_id

    @validator("similarity_boost", always=True)
    def stability_and_similarity_boost_check(cls, similarity_boost, values):
        # The two voice-control knobs only make sense together: reject a
        # config where exactly one of them is provided.
        stability_set = values.get("stability") is not None
        boost_set = similarity_boost is not None
        if stability_set != boost_set:
            raise ValueError("Both stability and similarity_boost must be set or not set.")
        return similarity_boost
118 changes: 118 additions & 0 deletions vocode/streaming/synthesizer/lmnt_synthesizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import asyncio
import base64
import hashlib
from typing import Optional

import aiohttp
from loguru import logger

from vocode.streaming.models.audio import AudioEncoding, SamplingRate
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.models.synthesizer import LMNTSynthesizerConfig
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer, SynthesisResult
from vocode.streaming.utils.create_task import asyncio_create_task

LMNT_BASE_URL = "https://api.lmnt.com/v1"
STREAMED_CHUNK_SIZE = 16000 * 2 // 4 # 1/8 of a second of 16kHz audio with 16-bit samples


class LMNTSynthesizer(BaseSynthesizer[LMNTSynthesizerConfig]):
    """Synthesizer backed by the LMNT text-to-speech HTTP API.

    Sends synthesis requests to the LMNT ``/ai/speech`` endpoint and streams
    the decoded audio back to the pipeline in ``chunk_size``-byte pieces.
    """

    def __init__(
        self,
        synthesizer_config: LMNTSynthesizerConfig,
    ):
        super().__init__(synthesizer_config)

        assert synthesizer_config.api_key is not None, "API key must be set"
        assert synthesizer_config.voice_id is not None, "Voice ID must be set"
        self.api_key = synthesizer_config.api_key

        self.voice_id = synthesizer_config.voice_id
        # NOTE(review): stability/similarity_boost mirror ElevenLabs-style
        # voice controls; kept for config compatibility but not sent to the
        # API since LMNT support for them is unconfirmed.
        self.stability = synthesizer_config.stability
        self.similarity_boost = synthesizer_config.similarity_boost
        self.sample_rate = self.synthesizer_config.sampling_rate

        # Map vocode audio encodings onto LMNT output formats so the response
        # matches what the pipeline expects.
        # TODO(review): confirm the exact `format` values against LMNT API docs.
        if self.synthesizer_config.audio_encoding == AudioEncoding.LINEAR16:
            self.output_format = "raw"  # raw 16-bit PCM at self.sample_rate
        elif self.synthesizer_config.audio_encoding == AudioEncoding.MULAW:
            self.output_format = "ulaw"
        else:
            raise ValueError(
                f"Unsupported audio encoding: {self.synthesizer_config.audio_encoding}"
            )

        self.session = aiohttp.ClientSession()

    async def create_speech_uncached(
        self,
        message: BaseMessage,
        chunk_size: int,
        is_first_text_chunk: bool = False,
        is_sole_text_chunk: bool = False,
    ) -> SynthesisResult:
        """Kick off synthesis for ``message`` and return a streaming result.

        The HTTP request runs in a background task; audio chunks of at most
        ``chunk_size`` bytes are pushed onto a queue consumed by the returned
        SynthesisResult.
        """
        self.total_chars += len(message.text)
        url = f"{LMNT_BASE_URL}/ai/speech"
        headers = {"X-API-Key": self.api_key}
        body = {
            "text": message.text,
            "voice": self.voice_id,
            # Explicitly request the output format/rate the pipeline expects
            # (previously these were computed but never sent to the API).
            "format": self.output_format,
            "sample_rate": str(self.sample_rate),
        }

        # Do NOT log `headers` here: it contains the API key.
        logger.debug(f"Sending request to {url} with body {body}")

        assert body["text"], "Text must not be empty"
        assert body["voice"], "Voice ID must not be empty"

        chunk_queue: asyncio.Queue[Optional[bytes]] = asyncio.Queue()
        asyncio_create_task(
            self.get_chunks(url, headers, body, chunk_size, chunk_queue),
        )

        return SynthesisResult(
            self.chunk_result_generator_from_queue(chunk_queue),
            # 150 wpm is a rough speaking-speed estimate for message cutoff.
            lambda seconds: self.get_message_cutoff_from_voice_speed(message, seconds, 150),
        )

    @classmethod
    def get_voice_identifier(cls, synthesizer_config: LMNTSynthesizerConfig):
        """Return a cache key identifying this voice configuration.

        The API key is hashed so the identifier can be logged/stored safely.
        """
        hashed_api_key = hashlib.sha256(f"{synthesizer_config.api_key}".encode("utf-8")).hexdigest()
        return ":".join(
            (
                "lmnt",
                hashed_api_key,
                str(synthesizer_config.voice_id),
                str(synthesizer_config.stability),
                str(synthesizer_config.similarity_boost),
                synthesizer_config.audio_encoding,
            )
        )

    async def get_chunks(
        self,
        url: str,
        headers: dict,
        body: dict,
        chunk_size: int,
        chunk_queue: asyncio.Queue[Optional[bytes]],
    ):
        """POST the synthesis request and enqueue decoded audio chunks.

        Always enqueues a trailing ``None`` sentinel so the consumer
        terminates, even on error or cancellation.
        """
        try:
            async with self.session.post(url, headers=headers, data=body) as resp:
                if resp.status != 200:
                    logger.error(f"LMNT API failed: {resp.status} {await resp.text()}")
                    raise Exception(f"LMNT API returned {resp.status} status code")

                data = await resp.json()
                audio = base64.b64decode(data["audio"])
                # Feed the pipeline in chunk_size pieces rather than one
                # giant buffer (previously chunk_size was ignored).
                for start in range(0, len(audio), chunk_size):
                    chunk_queue.put_nowait(audio[start : start + chunk_size])

        except asyncio.CancelledError:
            pass
        finally:
            chunk_queue.put_nowait(None)  # treated as sentinel

    async def close(self):
        """Release the underlying aiohttp session."""
        await self.session.close()