# Text-to-Speech WebSocket Examples
Complete examples for real-time TTS using WebSockets, including interruption and cancellation patterns for voice agents.
## Why WebSockets for TTS?

- Lowest Latency: Sub-100ms time-to-first-audio
- Streaming: Start playing audio before the text is fully processed
- Interactive: Send more text while audio is playing
- Interruption: Cancel mid-sentence for voice agent conversations
## Quick Start

### Basic WebSocket Connection

```javascript
const ws = new WebSocket("wss://api.slng.ai/v1/tts/deepgram/aura:2");
ws.binaryType = "arraybuffer"; // browsers deliver binary frames as Blob by default

ws.onopen = () => {
  // 1. Initialize session
  ws.send(
    JSON.stringify({
      type: "init",
      config: {
        encoding: "linear16",
        sample_rate: 24000,
      },
    }),
  );

  // 2. Send text to convert
  ws.send(
    JSON.stringify({
      type: "speak",
      text: "Hello! This is a WebSocket TTS example.",
    }),
  );

  // 3. Flush to get remaining audio
  ws.send(JSON.stringify({ type: "flush" }));
};

ws.onmessage = (event) => {
  if (event.data instanceof ArrayBuffer) {
    // Binary audio data - play it!
    playAudio(event.data);
  } else {
    // JSON control messages
    const message = JSON.parse(event.data);
    if (message.type === "ready") {
      console.log("Session ready:", message.session_id);
    } else if (message.type === "flushed") {
      console.log("All audio sent");
    }
  }
};
```
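The snippet above leaves `playAudio` undefined. A minimal sketch using the Web Audio API, assuming mono linear16 PCM at the 24 kHz rate requested in `init`. Raw PCM cannot be fed to `decodeAudioData` (it expects a container format such as WAV), so the samples are converted by hand:

```typescript
const audioContext = new AudioContext();
let playhead = 0; // AudioContext time at which the next chunk should start

function playAudio(chunk: ArrayBuffer) {
  // Convert 16-bit signed PCM to the float samples Web Audio expects
  const int16 = new Int16Array(chunk);
  if (int16.length === 0) return;
  const float32 = new Float32Array(int16.length);
  for (let i = 0; i < int16.length; i++) {
    float32[i] = int16[i] / 32768;
  }

  const buffer = audioContext.createBuffer(1, float32.length, 24000);
  buffer.copyToChannel(float32, 0);

  const source = audioContext.createBufferSource();
  source.buffer = buffer;
  source.connect(audioContext.destination);

  // Schedule chunks back to back so playback stays gapless
  playhead = Math.max(playhead, audioContext.currentTime);
  source.start(playhead);
  playhead += buffer.duration;
}
```

Scheduling against `playhead` instead of calling `source.start(0)` on arrival is what keeps consecutive chunks from clicking or overlapping.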
## Complete Examples

### JavaScript/TypeScript

#### Full Implementation with Audio Playback

```typescript
class TTSWebSocket {
  private ws: WebSocket;
  private audioContext: AudioContext;
  private audioQueue: AudioBuffer[] = [];
  private isPlaying: boolean = false;

  constructor(provider: string, model: string, variant: string) {
    const url = `wss://api.slng.ai/v1/tts/${provider}/${model}:${variant}`;
    this.ws = new WebSocket(url);
    this.ws.binaryType = "arraybuffer"; // receive binary frames as ArrayBuffer
    this.audioContext = new AudioContext();

    this.ws.onopen = () => this.handleOpen();
    this.ws.onmessage = (event) => this.handleMessage(event);
    this.ws.onerror = (error) => this.handleError(error);
    this.ws.onclose = () => this.handleClose();
  }

  private handleOpen() {
    console.log("WebSocket connected");

    // Initialize session
    this.send({
      type: "init",
      config: {
        encoding: "linear16",
        sample_rate: 24000,
      },
    });
  }

  private handleMessage(event: MessageEvent) {
    if (event.data instanceof ArrayBuffer) {
      // Binary audio data
      this.handleAudioChunk(event.data);
    } else {
      // JSON control messages
      const message = JSON.parse(event.data);

      switch (message.type) {
        case "ready":
          console.log("Session ready:", message.session_id);
          break;
        case "flushed":
          console.log("Audio buffer flushed");
          break;
        case "error":
          console.error("TTS Error:", message.code, message.message);
          break;
      }
    }
  }

  private handleAudioChunk(audioData: ArrayBuffer) {
    // Raw linear16 PCM can't go through decodeAudioData (it expects a
    // container format like WAV), so convert the samples manually.
    const int16 = new Int16Array(audioData);
    if (int16.length === 0) return;
    const float32 = new Float32Array(int16.length);
    for (let i = 0; i < int16.length; i++) {
      float32[i] = int16[i] / 32768;
    }

    const audioBuffer = this.audioContext.createBuffer(1, float32.length, 24000);
    audioBuffer.copyToChannel(float32, 0);

    this.audioQueue.push(audioBuffer);
    if (!this.isPlaying) {
      this.playNextChunk();
    }
  }

  private playNextChunk() {
    if (this.audioQueue.length === 0) {
      this.isPlaying = false;
      return;
    }

    this.isPlaying = true;
    const audioBuffer = this.audioQueue.shift()!;
    const source = this.audioContext.createBufferSource();
    source.buffer = audioBuffer;
    source.connect(this.audioContext.destination);

    source.onended = () => {
      this.playNextChunk();
    };

    source.start(0);
  }

  private handleError(error: Event) {
    console.error("WebSocket error:", error);
  }

  private handleClose() {
    console.log("WebSocket closed");
  }

  // Public API

  speak(text: string, flush: boolean = false) {
    this.send({
      type: "speak",
      text: text,
      flush: flush,
    });
  }

  flush() {
    this.send({ type: "flush" });
  }

  clear() {
    this.send({ type: "clear" });
    this.audioQueue = [];
  }

  cancel() {
    this.send({ type: "cancel" });
    this.audioQueue = [];
  }

  close() {
    this.ws.close();
  }

  private send(message: any) {
    if (this.ws.readyState === WebSocket.OPEN) {
      this.ws.send(JSON.stringify(message));
    }
  }
}

// Usage
const tts = new TTSWebSocket("deepgram", "aura", "2");

// Speak some text
tts.speak("Hello! How can I help you today?");

// Speak more text while previous is playing
setTimeout(() => {
  tts.speak("I can answer questions and have conversations.");
}, 1000);

// Flush when done
setTimeout(() => {
  tts.flush();
}, 2000);
```
### Python

#### Complete WebSocket Implementation

```python
import websocket
import json
import threading
import queue
import pyaudio

class TTSWebSocket:
    def __init__(self, provider: str, model: str, variant: str, api_key: str):
        self.url = f"wss://api.slng.ai/v1/tts/{provider}/{model}:{variant}"
        self.api_key = api_key  # attach per your account's auth scheme when connecting
        self.ws = None
        self.audio_queue = queue.Queue()
        self.is_playing = False

        # Initialize audio player
        self.audio = pyaudio.PyAudio()
        self.stream = self.audio.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=24000,
            output=True
        )

    def on_open(self, ws):
        print("WebSocket connected")

        # Initialize session
        ws.send(json.dumps({
            "type": "init",
            "config": {
                "encoding": "linear16",
                "sample_rate": 24000
            }
        }))

    def on_message(self, ws, message):
        if isinstance(message, bytes):
            # Binary audio data
            self.audio_queue.put(message)
            if not self.is_playing:
                threading.Thread(target=self.play_audio, daemon=True).start()
        else:
            # JSON control messages
            data = json.loads(message)

            if data["type"] == "ready":
                print(f"Session ready: {data['session_id']}")
            elif data["type"] == "flushed":
                print("Audio buffer flushed")
            elif data["type"] == "error":
                print(f"TTS Error: {data['code']} - {data['message']}")

    def on_error(self, ws, error):
        print(f"WebSocket error: {error}")

    def on_close(self, ws, close_status_code, close_msg):
        print("WebSocket closed")

    def play_audio(self):
        self.is_playing = True
        while not self.audio_queue.empty():
            audio_chunk = self.audio_queue.get()
            self.stream.write(audio_chunk)
        self.is_playing = False

    def connect(self):
        self.ws = websocket.WebSocketApp(
            self.url,
            on_open=self.on_open,
            on_message=self.on_message,
            on_error=self.on_error,
            on_close=self.on_close
        )

        # Run in background thread
        threading.Thread(target=self.ws.run_forever, daemon=True).start()

    def speak(self, text: str, flush: bool = False):
        if self.ws and self.ws.sock and self.ws.sock.connected:
            self.ws.send(json.dumps({
                "type": "speak",
                "text": text,
                "flush": flush
            }))

    def flush(self):
        if self.ws and self.ws.sock and self.ws.sock.connected:
            self.ws.send(json.dumps({"type": "flush"}))

    def clear(self):
        if self.ws and self.ws.sock and self.ws.sock.connected:
            self.ws.send(json.dumps({"type": "clear"}))
        # Clear local queue
        while not self.audio_queue.empty():
            self.audio_queue.get()

    def cancel(self):
        if self.ws and self.ws.sock and self.ws.sock.connected:
            self.ws.send(json.dumps({"type": "cancel"}))
        # Clear local queue
        while not self.audio_queue.empty():
            self.audio_queue.get()

    def close(self):
        if self.ws:
            self.ws.close()
        self.stream.stop_stream()
        self.stream.close()
        self.audio.terminate()

# Usage
import time

tts = TTSWebSocket('deepgram', 'aura', '2', 'YOUR_API_KEY')
tts.connect()
time.sleep(1)  # Wait for connection

tts.speak('Hello! How can I help you today?')
time.sleep(2)

tts.speak('I can answer questions and have conversations.')
time.sleep(2)

tts.flush()
time.sleep(3)
tts.close()
```
## Interruption & Cancellation Patterns

Critical for voice agents: handle user interruptions gracefully.
### Pattern 1: Immediate Interruption

Stop current speech immediately when the user starts speaking:

```javascript
class InterruptibleTTS {
  constructor() {
    this.tts = new WebSocket("wss://api.slng.ai/v1/tts/deepgram/aura:2");
    this.isCurrentlySpeaking = false;
    this.audioQueue = [];
    this.currentSource = null;
  }

  async handleUserInterruption() {
    if (this.isCurrentlySpeaking) {
      // 1. Cancel current TTS
      this.tts.send(JSON.stringify({ type: "cancel" }));

      // 2. Clear local audio buffer
      this.clearAudioQueue();

      // 3. Stop current playback
      this.stopCurrentPlayback();

      this.isCurrentlySpeaking = false;
    }
  }

  speak(text) {
    this.isCurrentlySpeaking = true;
    this.tts.send(
      JSON.stringify({
        type: "speak",
        text: text,
      }),
    );
    this.tts.send(JSON.stringify({ type: "flush" }));
  }

  clearAudioQueue() {
    // Clear any queued audio chunks
    this.audioQueue = [];
  }

  stopCurrentPlayback() {
    // Stop Web Audio API source
    if (this.currentSource) {
      this.currentSource.stop();
      this.currentSource = null;
    }
  }
}

// Usage in voice agent
const tts = new InterruptibleTTS();

// User starts speaking - interrupt immediately
// (microphoneDetector stands in for your speech-detection component)
microphoneDetector.on("speech_start", () => {
  tts.handleUserInterruption();
});

// Agent response
tts.speak("This is the agent speaking and can be interrupted...");
```
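`microphoneDetector` above is a placeholder for whatever speech detection you use. For illustration only, a crude energy-threshold detector built on `getUserMedia` and an `AnalyserNode`; the thresholds are arbitrary, and a production agent should use a proper VAD model instead:

```typescript
// Crude "speech_start" detection via RMS energy. Thresholds (0.02 / 0.01)
// are arbitrary and will need tuning; this is not a real VAD.
async function watchMicrophone(onSpeechStart: () => void) {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  const ctx = new AudioContext();
  const source = ctx.createMediaStreamSource(stream);
  const analyser = ctx.createAnalyser();
  analyser.fftSize = 2048;
  source.connect(analyser);

  const samples = new Float32Array(analyser.fftSize);
  let speaking = false;

  setInterval(() => {
    analyser.getFloatTimeDomainData(samples);
    let sum = 0;
    for (const s of samples) sum += s * s;
    const rms = Math.sqrt(sum / samples.length);

    if (rms > 0.02 && !speaking) {
      speaking = true;
      onSpeechStart(); // fire once per utterance
    } else if (rms < 0.01) {
      speaking = false;
    }
  }, 50);
}

// Usage: watchMicrophone(() => tts.handleUserInterruption());
```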
### Pattern 2: Clear and Restart

Clear the buffer and send a new message:

```javascript
function handleNewAgentResponse(responseText) {
  // 1. Clear any pending audio
  ws.send(JSON.stringify({ type: "clear" }));

  // 2. Send new response
  ws.send(
    JSON.stringify({
      type: "speak",
      text: responseText,
    }),
  );
  ws.send(JSON.stringify({ type: "flush" }));
}
```
### Pattern 3: Smart Interruption with Fade-Out

Fade out the current audio before switching:

```javascript
class SmartInterruptibleTTS {
  async interrupt(newText) {
    // 1. Fade out current audio
    await this.fadeOut(200); // 200ms fade

    // 2. Cancel server-side
    this.ws.send(JSON.stringify({ type: "cancel" }));

    // 3. Clear queue
    this.audioQueue = [];

    // 4. Wait brief moment
    await new Promise((resolve) => setTimeout(resolve, 100));

    // 5. Start new speech
    this.speak(newText);
  }

  async fadeOut(duration) {
    const audioContext = this.audioContext;
    const gainNode = this.gainNode;

    // Anchor the ramp at the current gain so the fade starts now,
    // not from the last scheduled automation event
    gainNode.gain.setValueAtTime(gainNode.gain.value, audioContext.currentTime);
    gainNode.gain.linearRampToValueAtTime(
      0,
      audioContext.currentTime + duration / 1000,
    );

    await new Promise((resolve) => setTimeout(resolve, duration));
  }
}
```
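`fadeOut` assumes every source is routed through a shared GainNode rather than straight to the destination. A minimal sketch of that wiring (the `audioContext` and `gainNode` field names match the class above; `FadeableAudioOutput` is illustrative):

```typescript
class FadeableAudioOutput {
  private audioContext = new AudioContext();
  private gainNode: GainNode;

  constructor() {
    // A master gain node sits between every source and the speakers,
    // so one ramp in fadeOut() attenuates whatever is currently playing
    this.gainNode = this.audioContext.createGain();
    this.gainNode.connect(this.audioContext.destination);
  }

  playBuffer(buffer: AudioBuffer) {
    const source = this.audioContext.createBufferSource();
    source.buffer = buffer;
    source.connect(this.gainNode); // not audioContext.destination
    source.start();
  }

  resetGain() {
    // Restore full volume before the next utterance; otherwise the
    // new speech plays at the faded-out level
    this.gainNode.gain.setValueAtTime(1, this.audioContext.currentTime);
  }
}
```

Remember to reset the gain after the cancel step; a fade that is never undone silences the next response too.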
### Pattern 4: Voice Agent Conversation Loop

A complete pattern for conversational AI:

```javascript
class VoiceAgent {
  constructor() {
    this.tts = new WebSocket("wss://api.slng.ai/v1/tts/deepgram/aura:2");
    this.tts.binaryType = "arraybuffer";
    this.stt = new WebSocket("wss://api.slng.ai/v1/stt/deepgram/nova:2");
    this.isAgentSpeaking = false;
    this.conversationActive = true;

    this.initializeTTS();
    this.initializeSTT();
  }

  initializeTTS() {
    this.tts.onopen = () => {
      this.tts.send(
        JSON.stringify({
          type: "init",
          config: { encoding: "linear16", sample_rate: 24000 },
        }),
      );
    };

    this.tts.onmessage = (event) => {
      if (event.data instanceof ArrayBuffer) {
        this.playAudio(event.data);
      } else {
        const msg = JSON.parse(event.data);
        if (msg.type === "flushed") {
          this.isAgentSpeaking = false;
        }
      }
    };
  }

  initializeSTT() {
    this.stt.onopen = () => {
      this.stt.send(
        JSON.stringify({
          type: "init",
          config: { language: "en", sample_rate: 16000 },
        }),
      );
    };

    this.stt.onmessage = (event) => {
      const msg = JSON.parse(event.data);

      if (msg.type === "transcript" && msg.is_final) {
        // User spoke - interrupt agent if speaking
        if (this.isAgentSpeaking) {
          this.interruptAgent();
        }

        // Process user input
        this.handleUserInput(msg.text);
      }
    };
  }

  interruptAgent() {
    // Cancel current TTS
    this.tts.send(JSON.stringify({ type: "cancel" }));
    this.clearAudioBuffer();
    this.isAgentSpeaking = false;
  }

  async handleUserInput(userText) {
    console.log("User said:", userText);

    // Get response from your LLM/logic
    const response = await this.generateResponse(userText);

    // Speak response
    this.speak(response);
  }

  speak(text) {
    this.isAgentSpeaking = true;
    this.tts.send(
      JSON.stringify({
        type: "speak",
        text: text,
      }),
    );
    this.tts.send(JSON.stringify({ type: "flush" }));
  }

  async generateResponse(userText) {
    // Your LLM call here
    return "I understand you said: " + userText;
  }

  playAudio(audioData) {
    // Your audio playback logic (see the playAudio sketch in Quick Start)
  }

  clearAudioBuffer() {
    // Clear queued audio
  }
}

// Usage
const agent = new VoiceAgent();
```
## Advanced Patterns

### Batch Text Streaming

Send multiple sentences for smoother speech:

```javascript
const sentences = text.split(". ");

sentences.forEach((sentence, i) => {
  ws.send(
    JSON.stringify({
      type: "speak",
      text: sentence + (i < sentences.length - 1 ? "." : ""),
      flush: false, // Don't flush until end
    }),
  );
});

// Flush after all sentences
ws.send(JSON.stringify({ type: "flush" }));
```
### Reconnection with Exponential Backoff

```javascript
class ResilientTTSWebSocket {
  constructor(provider, model, variant) {
    this.url = `wss://api.slng.ai/v1/tts/${provider}/${model}:${variant}`;
    this.backoff = 1000;
    this.maxBackoff = 30000;
    this.connect();
  }

  connect() {
    this.ws = new WebSocket(this.url);

    this.ws.onopen = () => {
      console.log("Connected");
      this.backoff = 1000; // Reset backoff on successful connection
      this.initialize();
    };

    this.ws.onclose = () => {
      console.log(`Reconnecting in ${this.backoff}ms...`);
      setTimeout(() => this.connect(), this.backoff);
      this.backoff = Math.min(this.backoff * 2, this.maxBackoff);
    };

    this.ws.onerror = (error) => {
      console.error("WebSocket error:", error);
    };

    this.ws.onmessage = (event) => {
      this.handleMessage(event);
    };
  }

  initialize() {
    this.ws.send(
      JSON.stringify({
        type: "init",
        config: { encoding: "linear16", sample_rate: 24000 },
      }),
    );
  }

  handleMessage(event) {
    // Your message handling
  }
}
```
## Control Messages Reference

### Client → Server

| Message | Description | Parameters |
|---|---|---|
| `init` | Initialize session | `config: {encoding, sample_rate, voice?}` |
| `speak` | Send text to convert | `text: string`, `flush?: boolean` |
| `flush` | Send remaining audio | None |
| `clear` | Clear audio buffer | None |
| `cancel` | Cancel generation | None |
### Server → Client

| Message | Description | Data |
|---|---|---|
| `ready` | Session initialized | `session_id: string` |
| `flushed` | Buffer flushed | None |
| `error` | Error occurred | `code: string`, `message: string` |
| Binary | Audio data | Raw PCM samples |
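For TypeScript clients, the two tables map naturally onto discriminated unions. A sketch of the protocol as this page describes it (the type names are illustrative, not from an official SDK):

```typescript
// Client → Server messages
type ClientMessage =
  | { type: "init"; config: { encoding: string; sample_rate: number; voice?: string } }
  | { type: "speak"; text: string; flush?: boolean }
  | { type: "flush" }
  | { type: "clear" }
  | { type: "cancel" };

// Server → Client JSON messages (binary frames carry raw PCM audio)
type ServerMessage =
  | { type: "ready"; session_id: string }
  | { type: "flushed" }
  | { type: "error"; code: string; message: string };

// Narrowing on `type` gives typed access to each variant's fields
function handleServerMessage(msg: ServerMessage) {
  switch (msg.type) {
    case "ready":
      console.log("Session ready:", msg.session_id);
      break;
    case "flushed":
      console.log("Buffer flushed");
      break;
    case "error":
      console.error(`TTS error ${msg.code}: ${msg.message}`);
      break;
  }
}
```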
## Common Issues & Solutions

### Issue: Audio is choppy

Solutions:
- Implement proper audio buffering (see the sketch below)
- Schedule chunks on the Web Audio API clock instead of starting each one on arrival
- Increase the amount of audio buffered before playback starts
- Check network latency
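Choppy playback usually means each chunk is started the moment it arrives. A minimal jitter-buffer sketch that holds playback until roughly 200 ms of audio is queued (an arbitrary, tunable threshold), then schedules chunks back to back on the AudioContext clock:

```typescript
const ctx = new AudioContext();
const pending: AudioBuffer[] = [];
let nextStart = 0;    // AudioContext time of the next chunk boundary
let buffering = true; // hold playback until we have some lead time

const MIN_LEAD_SECONDS = 0.2; // ~200ms of audio before starting (tunable)

function enqueue(buffer: AudioBuffer) {
  pending.push(buffer);

  const queued = pending.reduce((sum, b) => sum + b.duration, 0);
  if (buffering && queued < MIN_LEAD_SECONDS) {
    return; // keep buffering
  }
  buffering = false;

  // Drain the queue, scheduling each chunk right after the previous one
  while (pending.length > 0) {
    const buf = pending.shift()!;
    const source = ctx.createBufferSource();
    source.buffer = buf;
    source.connect(ctx.destination);

    nextStart = Math.max(nextStart, ctx.currentTime);
    source.start(nextStart);
    nextStart += buf.duration;
  }
}
```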
### Issue: Interruption not working

Solutions:
- Send the `cancel` message immediately
- Clear the local audio queue
- Stop the current audio source
- Don't wait for server acknowledgment
### Issue: Connection drops

Solutions:
- Implement reconnection logic (see the exponential backoff example above)
- Send periodic ping messages where the runtime supports them (browser WebSockets expose no ping API; see the watchdog sketch below)
- Monitor connection state
- Handle the onclose event
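Because browser WebSockets cannot send protocol-level pings and this page does not document an application-level keepalive message, a safe client-side fallback is a silence watchdog that forces a reconnect. This sketch assumes a `ws` socket and `handleMessage` function like the earlier examples; the 30-second threshold is arbitrary:

```typescript
let lastMessageAt = Date.now();
const SILENCE_LIMIT_MS = 30_000; // reconnect if nothing arrives for 30s (tunable)

ws.onmessage = (event) => {
  lastMessageAt = Date.now();
  handleMessage(event); // your normal handling
};

// Watchdog: if the connection looks dead, close it so onclose-based
// reconnection (like the exponential backoff class above) kicks in.
// Call clearInterval(watchdog) when closing intentionally.
const watchdog = setInterval(() => {
  const silentFor = Date.now() - lastMessageAt;
  if (ws.readyState === WebSocket.OPEN && silentFor > SILENCE_LIMIT_MS) {
    console.warn(`No messages for ${silentFor}ms, forcing reconnect`);
    ws.close();
  }
}, 5_000);
```

In a session that legitimately sits idle, reset the timer on outgoing sends as well, or only arm the watchdog while audio is expected; otherwise it will churn reconnects.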