Files
resolutionflow/.agent/skills/speech-to-text/references/realtime-server-side.md
2026-02-15 00:43:41 -05:00

8.2 KiB

Server-Side Real-Time Streaming

Transcribe audio streams in real-time from your server with ultra-low latency.

Installation

# Python
pip install elevenlabs python-dotenv pydub

# JavaScript
npm install @elevenlabs/elevenlabs-js dotenv

Warning: Do not use npm install elevenlabs - that's an outdated v1.x package. Always use @elevenlabs/elevenlabs-js.

Configuration

Store your API key in a .env file:

ELEVENLABS_API_KEY=<your_api_key_here>

Stream from URL

Python

from dotenv import load_dotenv
import os
import asyncio
from elevenlabs.client import ElevenLabs
from elevenlabs import RealtimeEvents, RealtimeUrlOptions

load_dotenv()

async def main():
    elevenlabs = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
    stop_event = asyncio.Event()

    connection = await elevenlabs.speech_to_text.realtime.connect(RealtimeUrlOptions(
        model_id="scribe_v2_realtime",
        url="https://npr-ice.streamguys1.com/live.mp3",
        include_timestamps=True,
    ))

    def on_partial_transcript(data):
        print(f"Partial: {data.get('text', '')}")

    def on_committed_transcript(data):
        print(f"Committed: {data.get('text', '')}")

    def on_error(error):
        print(f"Error: {error}")
        stop_event.set()

    def on_close():
        print("Connection closed")

    connection.on(RealtimeEvents.PARTIAL_TRANSCRIPT, on_partial_transcript)
    connection.on(RealtimeEvents.COMMITTED_TRANSCRIPT, on_committed_transcript)
    connection.on(RealtimeEvents.ERROR, on_error)
    connection.on(RealtimeEvents.CLOSE, on_close)

    try:
        await stop_event.wait()
    except KeyboardInterrupt:
        print("\nStopping transcription...")
    finally:
        await connection.close()

if __name__ == "__main__":
    asyncio.run(main())

JavaScript

import "dotenv/config";
import { ElevenLabsClient, RealtimeEvents } from "@elevenlabs/elevenlabs-js";

const elevenlabs = new ElevenLabsClient();

const connection = await elevenlabs.speechToText.realtime.connect({
  modelId: "scribe_v2_realtime",
  url: "https://npr-ice.streamguys1.com/live.mp3",
  includeTimestamps: true,
});

connection.on(RealtimeEvents.PARTIAL_TRANSCRIPT, (transcript) => {
  console.log("Partial transcript", transcript);
});

connection.on(RealtimeEvents.COMMITTED_TRANSCRIPT, (transcript) => {
  console.log("Committed transcript", transcript);
});

connection.on(RealtimeEvents.ERROR, (error) => {
  console.log("Error", error);
});

connection.on(RealtimeEvents.CLOSE, () => {
  console.log("Connection closed");
});

Manual Audio Chunking

For local files or custom audio streams, convert to PCM format and send in chunks.

Python

import asyncio
import base64
import os
from dotenv import load_dotenv
from pathlib import Path
from elevenlabs.client import ElevenLabs
from elevenlabs import AudioFormat, CommitStrategy, RealtimeEvents, RealtimeAudioOptions
from pydub import AudioSegment

load_dotenv()

def load_and_convert_audio(audio_path: str | Path, target_sample_rate: int = 16000) -> bytes:
    if str(audio_path).lower().endswith('.pcm'):
        with open(audio_path, 'rb') as f:
            return f.read()

    audio = AudioSegment.from_file(audio_path)
    if audio.channels > 1:
        audio = audio.set_channels(1)
    if audio.frame_rate != target_sample_rate:
        audio = audio.set_frame_rate(target_sample_rate)
    audio = audio.set_sample_width(2)
    return audio.raw_data

async def main():
    elevenlabs = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
    transcription_complete = asyncio.Event()

    connection = await elevenlabs.speech_to_text.realtime.connect(RealtimeAudioOptions(
        model_id="scribe_v2_realtime",
        audio_format=AudioFormat.PCM_16000,
        sample_rate=16000,
        commit_strategy=CommitStrategy.MANUAL,
        include_timestamps=True,
    ))

    def on_session_started(data):
        print(f"Session started: {data}")
        asyncio.create_task(send_audio())

    def on_partial_transcript(data):
        transcript = data.get('text', '')
        if transcript:
            print(f"Partial: {transcript}")

    def on_committed_transcript(data):
        transcript = data.get('text', '')
        print(f"\nCommitted transcript: {transcript}")

    def on_committed_transcript_with_timestamps(data):
        print(f"Timestamps: {data.get('words', '')}")
        transcription_complete.set()

    def on_error(error):
        print(f"Error: {error}")
        transcription_complete.set()

    def on_close():
        print("Connection closed")
        transcription_complete.set()

    connection.on(RealtimeEvents.SESSION_STARTED, on_session_started)
    connection.on(RealtimeEvents.PARTIAL_TRANSCRIPT, on_partial_transcript)
    connection.on(RealtimeEvents.COMMITTED_TRANSCRIPT, on_committed_transcript)
    connection.on(RealtimeEvents.COMMITTED_TRANSCRIPT_WITH_TIMESTAMPS, on_committed_transcript_with_timestamps)
    connection.on(RealtimeEvents.ERROR, on_error)
    connection.on(RealtimeEvents.CLOSE, on_close)

    async def send_audio():
        audio_file_path = Path("audio.mp3")
        audio_data = load_and_convert_audio(audio_file_path)
        chunk_size = 32000  # 1 second of audio at 16kHz
        chunks = [audio_data[i:i + chunk_size] for i in range(0, len(audio_data), chunk_size)]

        for i, chunk in enumerate(chunks):
            chunk_base64 = base64.b64encode(chunk).decode('utf-8')
            await connection.send({"audio_base_64": chunk_base64, "sample_rate": 16000})

            if i < len(chunks) - 1:
                await asyncio.sleep(1)

        await asyncio.sleep(0.5)
        await connection.commit()

    try:
        await transcription_complete.wait()
    except KeyboardInterrupt:
        print("\nStopping...")
    finally:
        await connection.close()

if __name__ == "__main__":
    asyncio.run(main())

JavaScript

import "dotenv/config";
import * as fs from "node:fs";
import { ElevenLabsClient, RealtimeEvents, AudioFormat } from "@elevenlabs/elevenlabs-js";

const elevenlabs = new ElevenLabsClient();

const connection = await elevenlabs.speechToText.realtime.connect({
  modelId: "scribe_v2_realtime",
  audioFormat: AudioFormat.PCM_16000,
  sampleRate: 16000,
  includeTimestamps: true,
});

connection.on(RealtimeEvents.SESSION_STARTED, (data) => {
  console.log("Session started", data);
  sendAudio();
});

connection.on(RealtimeEvents.PARTIAL_TRANSCRIPT, (transcript) => {
  console.log("Partial transcript", transcript);
});

connection.on(RealtimeEvents.COMMITTED_TRANSCRIPT, (transcript) => {
  console.log("Committed transcript", transcript);
});

connection.on(RealtimeEvents.COMMITTED_TRANSCRIPT_WITH_TIMESTAMPS, (transcript) => {
  console.log("Committed with timestamps", transcript);
});

connection.on(RealtimeEvents.ERROR, (error) => {
  console.log("Error", error);
});

connection.on(RealtimeEvents.CLOSE, () => {
  console.log("Connection closed");
});

async function sendAudio() {
  const pcmFilePath = "audio.pcm";
  const chunkSize = 32000;
  const audioBuffer = fs.readFileSync(pcmFilePath);
  const chunks: Buffer[] = [];

  for (let i = 0; i < audioBuffer.length; i += chunkSize) {
    const chunk = audioBuffer.subarray(i, i + chunkSize);
    chunks.push(chunk);
  }

  for (let i = 0; i < chunks.length; i++) {
    const chunk = chunks[i];
    const chunkBase64 = chunk.toString("base64");

    connection.send({
      audioBase64: chunkBase64,
      sampleRate: 16000,
    });

    if (i < chunks.length - 1) {
      await new Promise((resolve) => setTimeout(resolve, 1000));
    }
  }

  await new Promise((resolve) => setTimeout(resolve, 500));
  connection.commit();
}

Direct WebSocket Connection

For cases where the SDK cannot be used:

wss://api.elevenlabs.io/v1/speech-to-text/realtime?model_id=scribe_v2_realtime

Message Format

{
  "message_type": "input_audio_chunk",
  "audio_base_64": "<base64-encoded-audio>",
  "sample_rate": 16000
}

Commit Message

{
  "message_type": "commit"
}

Audio Requirements

Parameter Value
Format PCM 16-bit
Sample Rate 16000 Hz (recommended)
Channels Mono
Chunk Size 32,000 bytes = 1 second

Supported sample rates: 8kHz to 48kHz