less latency idea #857
Replies: 4 comments
-
Thanks for asking your question about Deepgram! If you didn't already include it in your post, please be sure to add as much detail as possible so we can assist you efficiently, such as:
-
Your settings, namely … This is a guide for measuring latency: https://developers.deepgram.com/docs/measuring-streaming-latency. It recommends only making these measurements on interim results, but it's actually also valid for final results if those results are also …
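As a rough illustration of what that guide describes, here is a minimal sketch (assuming the same deepgram connection and LiveTranscriptionEvents used in the snippet later in this thread) that estimates per-result latency by comparing how much audio you have sent against each result's end timestamp (start + duration). The audioCursorSeconds counter is an assumed variable you would maintain yourself as you stream audio.

```js
// Assumed: you keep track of how many seconds of audio you have sent so far.
let audioCursorSeconds = 0;
// e.g. advance it for every chunk you send:
// audioCursorSeconds += chunkBytes / (sampleRate * bytesPerSample);

deepgram.addListener(LiveTranscriptionEvents.Transcript, (data) => {
  // Seconds of audio this result covers, per the measuring-streaming-latency guide.
  const transcriptEnd = data.start + data.duration;
  const latencySeconds = audioCursorSeconds - transcriptEnd;
  const kind = data.is_final ? "final" : "interim";
  console.log(`${kind} result latency ≈ ${latencySeconds.toFixed(2)}s`);
});
```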
-
Hi Team, I have a clear understanding of the …

About the Utterance End Event
If we do not receive the … This logic helps us avoid noise while ensuring the LLM call is made.

Current Logic
We have two triggers for making an LLM call: …
One of these two triggers will always occur, so we are confident about our LLM call workflow.

Concern
I am exploring ways to eliminate interim results (…
Question
Additionally, how much latency improvement can we expect if we set …?

For reference, we are following the implementation at Deepgram-Twilio Streaming Voice Agent. I’d appreciate any suggestions or best practices to reduce latency while retaining functionality for interruption detection and maintaining reliable triggers for LLM calls. Thank you!

Code snippet (our endpointing is 50 ms; can we replace the interim-results logic with the other logic?):

deepgram.addListener(LiveTranscriptionEvents.Transcript, (data) => { /* … */ });
deepgram.addListener(LiveTranscriptionEvents.UtteranceEnd, () => { /* … */ });
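For illustration, here is a hedged sketch of the two-trigger idea described in the post above (not the poster's actual code): a final transcript with speech_final fires the first trigger, and UtteranceEnd acts as the fallback. callLLM and pendingTranscript are assumed, illustrative names.

```js
// Sketch of the two LLM-call triggers; callLLM and pendingTranscript are assumed names.
let pendingTranscript = "";

deepgram.addListener(LiveTranscriptionEvents.Transcript, (data) => {
  const text = data.channel.alternatives[0].transcript;
  if (!text) return;
  if (data.is_final) {
    pendingTranscript = (pendingTranscript + " " + text).trim();
    // Trigger 1: endpointing saw enough silence, so this final is also speech_final.
    if (data.speech_final) {
      callLLM(pendingTranscript);
      pendingTranscript = "";
    }
  }
  // Interim results (is_final === false) would be the place to detect interruptions.
});

deepgram.addListener(LiveTranscriptionEvents.UtteranceEnd, () => {
  // Trigger 2: no speech_final arrived, but UtteranceEnd says the utterance is over.
  if (pendingTranscript) {
    callLLM(pendingTranscript);
    pendingTranscript = "";
  }
});
```

One caveat worth noting: per Deepgram's docs, utterance_end_ms relies on interim results being enabled, so removing interim results entirely would also remove the UtteranceEnd fallback trigger.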
-
This is very true, utterance_end does catch important edge cases. I wanted to suggest maybe trying out Deepgram's new voice agent API in early access: https://deepgram.com/learn/introducing-ai-voice-agent-api. It handles these conversational flow issues with innovative techniques under the hood and can greatly simplify things. You can even bring your own LLM.
-
See, let me explain: by default the endpointing is 10 ms, so as soon as the user speaks, if the text is ready it will give the text after 10 ms. I am using STT, that is fine. I mostly don't need any interim results, and I don't want to use utterance_end_ms=1000. My first question is: if I DON'T use utterance_end_ms, why is it taking 1.2 seconds to give the output? If I do use it, it gives me the same latency. I just want the text to come after 10 ms and don't want anything else to be used. So how much latency will there be, i.e. incoming latency + outgoing latency + transcription latency? I want the STT to come in below 1 second, around 500 ms. Is that possible? What is the best possible number so that I can get a transcription? I don't care about the speaker, just tell me the best latency number I can get by using all those parameters.
The latency is 1.2 to 2 seconds and I want to bring it down to 0.5 to 0.9 seconds. Is it possible? Best latency figure, please?
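For reference, here is a minimal configuration sketch of the latency-related options discussed in this thread; the values are illustrative rather than a recommended recipe, and the exact connection call depends on the SDK version (the Deepgram-Twilio demo referenced earlier uses something like deepgram.transcription.live(options)).

```js
// Illustrative low-latency options; values are assumptions, not a guaranteed recipe.
const options = {
  model: "nova-2",        // example model name
  encoding: "mulaw",      // Twilio media streams send 8 kHz mulaw audio
  sample_rate: 8000,
  endpointing: 50,        // ms of silence before speech_final, as mentioned in the thread
  interim_results: true,  // UtteranceEnd / utterance_end_ms only work with interim results on
  utterance_end_ms: 1000, // fallback end-of-utterance signal
};
```

Roughly speaking, the total figure being asked about is network round trip plus Deepgram's processing plus whatever silence wait (endpointing or utterance_end_ms) is configured, so the end-of-utterance wait usually dominates rather than the raw transcription time.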