Migration guide: Speechmatics to AssemblyAI

This guide walks through the process of migrating from Speechmatics to AssemblyAI for Streaming Speech-to-Text.

Get started

Before we begin, make sure you have an AssemblyAI account and an API key. You can sign up for a free account and get your API key from your dashboard.

Side-by-side code comparison

Below is a side-by-side comparison of basic Python code snippets for transcribing streaming audio with Speechmatics and AssemblyAI.

import pyaudio
import websocket
import json
import threading
import time

# --- Configuration ---

YOUR_API_KEY = "YOUR-API-KEY"  # Replace with your actual API key

CONNECTION_PARAMS = {
    "language": "en",
    "enable_partials": True,
    "max_delay": 2.0
}
API_ENDPOINT = "wss://eu2.rt.speechmatics.com/v2/en"

# Audio Configuration

FRAMES_PER_BUFFER = 1024  # Chunk size
SAMPLE_RATE = None  # Will be set based on device capabilities
CHANNELS = 1
FORMAT = pyaudio.paFloat32  # Speechmatics uses float32 format

# Global variables for audio stream and websocket

audio = None
stream = None
ws_app = None
audio_thread = None
stop_event = threading.Event()  # To signal the audio thread to stop
audio_seq_no = 0  # Track number of audio chunks sent

# --- WebSocket Event Handlers ---

def on_open(ws):
    """Called when the WebSocket connection is established."""
    print("WebSocket connection opened.")
    print(f"Connected to: {API_ENDPOINT}")

    # Send StartRecognition message
    start_message = {
        "message": "StartRecognition",
        "audio_format": {
            "type": "raw",
            "encoding": "pcm_f32le",
            "sample_rate": SAMPLE_RATE
        },
        "transcription_config": {
            "language": CONNECTION_PARAMS["language"],
            "enable_partials": CONNECTION_PARAMS["enable_partials"],
            "max_delay": CONNECTION_PARAMS["max_delay"]
        }
    }
    ws.send(json.dumps(start_message))

def on_message(ws, message):
    global audio_seq_no

    try:
        data = json.loads(message)
        msg_type = data.get('message')

        if msg_type == "RecognitionStarted":
            session_id = data.get('id')
            print(f"\nSession began: ID={session_id}")

            # Start sending audio data in a separate thread
            def stream_audio():
                global audio_seq_no, stream
                print("Starting audio streaming...")
                while not stop_event.is_set():
                    try:
                        audio_data = stream.read(FRAMES_PER_BUFFER, exception_on_overflow=False)
                        # Send audio data as binary message
                        ws.send(audio_data, websocket.ABNF.OPCODE_BINARY)
                        audio_seq_no += 1
                    except Exception as e:
                        print(f"Error streaming audio: {e}")
                        # If stream read fails, likely means it's closed, stop the loop
                        break
                print("Audio streaming stopped.")

            global audio_thread
            audio_thread = threading.Thread(target=stream_audio)
            audio_thread.daemon = True  # Allow main thread to exit even if this thread is running
            audio_thread.start()

        elif msg_type == "AddPartialTranscript":
            transcript = data.get('metadata', {}).get('transcript', '')
            if transcript:
                print(f"\r{transcript}", end='')

        elif msg_type == "AddTranscript":
            transcript = data.get('metadata', {}).get('transcript', '')
            if transcript:
                # Clear previous line for final messages
                print('\r' + ' ' * 80 + '\r', end='')
                print(transcript)

        elif msg_type == "EndOfTranscript":
            print("\nSession Terminated: Transcription complete")

        elif msg_type == "Error":
            error_type = data.get('type')
            reason = data.get('reason')
            print(f"\nWebSocket Error: {error_type} - {reason}")
            stop_event.set()

    except json.JSONDecodeError as e:
        print(f"Error decoding message: {e}")
    except Exception as e:
        print(f"Error handling message: {e}")

def on_error(ws, error):
    """Called when a WebSocket error occurs."""
    print(f"\nWebSocket Error: {error}")
    # Attempt to signal stop on error
    stop_event.set()

def on_close(ws, close_status_code, close_msg):
    """Called when the WebSocket connection is closed."""
    print(f"\nWebSocket Disconnected: Status={close_status_code}, Msg={close_msg}")
    # Ensure audio resources are released
    global stream, audio
    stop_event.set()  # Signal audio thread just in case it's still running

    if stream:
        if stream.is_active():
            stream.stop_stream()
        stream.close()
        stream = None
    if audio:
        audio.terminate()
        audio = None
    # Try to join the audio thread to ensure clean exit
    if audio_thread and audio_thread.is_alive():
        audio_thread.join(timeout=1.0)

# --- Main Execution ---

def run():
    global audio, stream, ws_app, SAMPLE_RATE

    # Initialize PyAudio
    audio = pyaudio.PyAudio()

    # Get default input device (can alter to specify specific device)
    default_device = audio.get_default_input_device_info()
    device_index = default_device['index']
    SAMPLE_RATE = int(audio.get_device_info_by_index(device_index)['defaultSampleRate'])

    print(f"Using microphone: {default_device['name']}")

    # Open microphone stream
    try:
        stream = audio.open(
            input=True,
            frames_per_buffer=FRAMES_PER_BUFFER,
            channels=CHANNELS,
            format=FORMAT,
            rate=SAMPLE_RATE,
            input_device_index=device_index
        )
        print("Microphone stream opened successfully.")
        print("Speak into your microphone. Press Ctrl+C to stop.")
    except Exception as e:
        print(f"Error opening microphone stream: {e}")
        if audio:
            audio.terminate()
        return  # Exit if microphone cannot be opened

    # Create WebSocketApp
    ws_app = websocket.WebSocketApp(
        API_ENDPOINT,
        header={"Authorization": f"Bearer {YOUR_API_KEY}"},  # Speechmatics uses Bearer token
        on_open=on_open,
        on_message=on_message,
        on_error=on_error,
        on_close=on_close,
    )

    # Run WebSocketApp in a separate thread to allow main thread to catch KeyboardInterrupt
    ws_thread = threading.Thread(target=lambda: ws_app.run_forever(ping_interval=30, ping_timeout=10))
    ws_thread.daemon = True
    ws_thread.start()

    try:
        # Keep main thread alive until interrupted
        while ws_thread.is_alive():
            time.sleep(0.1)
    except KeyboardInterrupt:
        print("\nCtrl+C received. Stopping...")
        stop_event.set()  # Signal audio thread to stop

        # Send EndOfStream message to the server
        if ws_app and ws_app.sock and ws_app.sock.connected:
            try:
                end_message = {
                    "message": "EndOfStream",
                    "last_seq_no": audio_seq_no
                }
                print(f"Sending termination message: {json.dumps(end_message)}")
                ws_app.send(json.dumps(end_message))
                # Give a moment for messages to process before forceful close
                time.sleep(1)
            except Exception as e:
                print(f"Error sending termination message: {e}")

        # Close the WebSocket connection (will trigger on_close)
        if ws_app:
            ws_app.close()

        # Wait for WebSocket thread to finish
        ws_thread.join(timeout=2.0)

    except Exception as e:
        print(f"\nAn unexpected error occurred: {e}")
        stop_event.set()
        if ws_app:
            ws_app.close()
        ws_thread.join(timeout=2.0)

    finally:
        # Final cleanup (already handled in on_close, but good as a fallback)
        if stream and stream.is_active():
            stream.stop_stream()
        if stream:
            stream.close()
        if audio:
            audio.terminate()
        print("Cleanup complete. Exiting.")

if __name__ == "__main__":
    run()
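
For the AssemblyAI side of the comparison, a condensed equivalent might look like the following sketch. The endpoint, query parameters, and message types are based on AssemblyAI's v3 Streaming API; check the current API reference for the authoritative version. The step-by-step notes below walk through each difference in detail.

import pyaudio
import websocket
import json
import threading
import urllib.parse

YOUR_API_KEY = "YOUR-API-KEY"  # Replace with your actual API key

# Session options are passed as query parameters; no handshake message is needed
CONNECTION_PARAMS = {"sample_rate": 16000, "format_turns": True}
API_ENDPOINT = "wss://streaming.assemblyai.com/v3/ws?" + urllib.parse.urlencode(CONNECTION_PARAMS)

FRAMES_PER_BUFFER = 800  # 50 ms of audio at 16 kHz
stop_event = threading.Event()  # Set this to stop the audio thread

audio = pyaudio.PyAudio()
stream = audio.open(input=True, frames_per_buffer=FRAMES_PER_BUFFER,
                    channels=1, format=pyaudio.paInt16, rate=16000)

def on_open(ws):
    # Configuration was already sent in the URL, so start streaming immediately
    def stream_audio():
        while not stop_event.is_set():
            data = stream.read(FRAMES_PER_BUFFER, exception_on_overflow=False)
            ws.send(data, websocket.ABNF.OPCODE_BINARY)
    threading.Thread(target=stream_audio, daemon=True).start()

def on_message(ws, message):
    data = json.loads(message)
    if data.get("type") == "Turn":
        transcript = data.get("transcript", "")
        if data.get("turn_is_formatted"):
            print(transcript)                  # Final, formatted turn
        else:
            print(f"\r{transcript}", end="")   # In-progress turn
    elif data.get("type") == "Termination":
        print("\nSession terminated")

ws_app = websocket.WebSocketApp(
    API_ENDPOINT,
    header={"Authorization": YOUR_API_KEY},  # Raw key, no "Bearer" prefix
    on_open=on_open,
    on_message=on_message,
)
ws_app.run_forever()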

Step 1: Install dependencies

Install the required Python packages.

$ pip install websocket-client pyaudio

Step 2: Configure the API key

In this step, you’ll configure your API key to authenticate your requests.

1. Navigate to API Keys in your account settings and copy your API key.

2. Store your API key in a variable. Replace YOUR-API-KEY with your copied API key.

import pyaudio
import websocket
import json
import threading
import time

YOUR_API_KEY = "YOUR-API-KEY"
Optionally, Speechmatics lets you generate a short-lived temporary token instead of authenticating with your long-lived API key:

import requests

def generate_temp_token(api_key, ttl=60):
    """Generate a temporary authentication token that expires after the specified time."""
    url = "https://mp.speechmatics.com/v1/api_keys?type=rt"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "ttl": ttl
    }

    response = requests.post(url, json=payload, headers=headers)
    data = response.json()
    return data.get("key_value")
Token usage

Instead of authorizing your request with YOUR_API_KEY (via request header), use the temporary token generated by this function when establishing the WebSocket connection.

API_ENDPOINT = f"wss://eu2.rt.speechmatics.com/v2?jwt={generate_temp_token(YOUR_API_KEY)}"
ws_app = websocket.WebSocketApp(
    API_ENDPOINT,
    on_open=on_open,
    on_message=on_message,
    on_error=on_error,
    on_close=on_close,
)
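
AssemblyAI offers a similar temporary-token flow. Below is a minimal sketch, assuming the v3 Streaming token endpoint (https://streaming.assemblyai.com/v3/token) and its expires_in_seconds parameter; verify both against the current API reference.

import requests

def generate_aai_temp_token(api_key, expires_in_seconds=60):
    """Generate a temporary AssemblyAI streaming token (sketch)."""
    response = requests.get(
        "https://streaming.assemblyai.com/v3/token",
        headers={"Authorization": api_key},  # Raw key, no "Bearer" prefix
        params={"expires_in_seconds": expires_in_seconds},
    )
    response.raise_for_status()
    return response.json().get("token")

# The token is then passed as a query parameter rather than a header, e.g.:
# API_ENDPOINT = f"wss://streaming.assemblyai.com/v3/ws?sample_rate=16000&token={generate_aai_temp_token(YOUR_API_KEY)}"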

Step 3: Set up audio configuration

Configure the audio settings for your microphone stream.

import pyaudio
import threading

# Audio Configuration
FRAMES_PER_BUFFER = 1024  # Chunk size
SAMPLE_RATE = None  # Will be set based on device capabilities
CHANNELS = 1
FORMAT = pyaudio.paFloat32  # Speechmatics uses float32 format

# Global variables for audio stream and websocket
audio = None
stream = None
ws_app = None
audio_thread = None
stop_event = threading.Event()  # To signal the audio thread to stop
audio_seq_no = 0  # Track number of audio chunks sent

def run():
    global audio, stream, ws_app, SAMPLE_RATE

    # Initialize PyAudio
    audio = pyaudio.PyAudio()

    # Get default input device (can alter to specify specific device)
    default_device = audio.get_default_input_device_info()
    device_index = default_device['index']
    SAMPLE_RATE = int(audio.get_device_info_by_index(device_index)['defaultSampleRate'])

    print(f"Using microphone: {default_device['name']}")

    # Open microphone stream
    try:
        stream = audio.open(
            input=True,
            frames_per_buffer=FRAMES_PER_BUFFER,
            channels=CHANNELS,
            format=FORMAT,
            rate=SAMPLE_RATE,
            input_device_index=device_index
        )
        print("Microphone stream opened successfully.")
        print("Speak into your microphone. Press Ctrl+C to stop.")
    except Exception as e:
        print(f"Error opening microphone stream: {e}")
        if audio:
            audio.terminate()
        return  # Exit if microphone cannot be opened
Sample rate

Speechmatics recommends using a 16 kHz sample rate for speech audio. Anything higher will be downsampled server-side.

Audio data format

If you want to stream data from elsewhere, make sure that your audio data is in the following format:

  • Single-channel
  • PCM16 (default) or Mu-law encoding (see Specifying the encoding)
  • A sample rate that matches the value of the sample_rate parameter (16 kHz is recommended)
  • 50 milliseconds of audio per message (larger chunk sizes are workable, but may result in latency fluctuations)
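
For AssemblyAI, the equivalent PyAudio configuration might look like the sketch below. Note paInt16 (PCM16) in place of paFloat32 and a fixed 16 kHz sample rate; the 800-frame buffer size is an assumption chosen so each chunk carries 50 ms of audio.

import pyaudio

# Audio configuration for AssemblyAI (sketch)
SAMPLE_RATE = 16000                    # Matches the sample_rate query parameter
CHANNELS = 1                           # Single-channel audio
FORMAT = pyaudio.paInt16               # PCM16 encoding (AssemblyAI default)
FRAMES_PER_BUFFER = SAMPLE_RATE // 20  # 800 frames = 50 ms of audio per message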

Step 4: Create event handlers

In this step, you’ll set up callback functions that handle the different events.

1. Create functions to handle the events from the real-time service.

import json

def on_open(ws):
    """Called when the WebSocket connection is established."""
    print("WebSocket connection opened.")
    print(f"Connected to: {API_ENDPOINT}")

    # Send StartRecognition message
    start_message = {
        "message": "StartRecognition",
        "audio_format": {
            "type": "raw",
            "encoding": "pcm_f32le",
            "sample_rate": SAMPLE_RATE
        },
        "transcription_config": {
            "language": CONNECTION_PARAMS["language"],
            "enable_partials": CONNECTION_PARAMS["enable_partials"],
            "max_delay": CONNECTION_PARAMS["max_delay"]
        }
    }
    ws.send(json.dumps(start_message))

def on_error(ws, error):
    """Called when a WebSocket error occurs."""
    print(f"\nWebSocket Error: {error}")
    # Attempt to signal stop on error
    stop_event.set()

def on_close(ws, close_status_code, close_msg):
    """Called when the WebSocket connection is closed."""
    print(f"\nWebSocket Disconnected: Status={close_status_code}, Msg={close_msg}")
    # Ensure audio resources are released
    global stream, audio
    stop_event.set()  # Signal audio thread just in case it's still running

    if stream:
        if stream.is_active():
            stream.stop_stream()
        stream.close()
        stream = None
    if audio:
        audio.terminate()
        audio = None
    # Try to join the audio thread to ensure clean exit
    if audio_thread and audio_thread.is_alive():
        audio_thread.join(timeout=1.0)
Connection configuration

Speechmatics requires a handshake in which the connection configuration is sent before any audio is streamed. AssemblyAI instead lets you configure the connection via query parameters in the URL and start streaming audio immediately.

The Speechmatics handshake begins when on_open sends a StartRecognition message to configure the session. Audio streaming starts only once a RecognitionStarted message has been received in the on_message callback.
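
In practice, this means the AssemblyAI session options move into the connection URL, as in the following sketch (the endpoint and the sample_rate and format_turns parameters are based on AssemblyAI's v3 Streaming API):

import urllib.parse

CONNECTION_PARAMS = {
    "sample_rate": 16000,  # Must match the audio you send
    "format_turns": True,  # Request formatted final transcripts
}
API_ENDPOINT = "wss://streaming.assemblyai.com/v3/ws?" + urllib.parse.urlencode(CONNECTION_PARAMS)

# With the configuration in the URL, on_open only needs to start the audio
# thread; there is no StartRecognition message and no wait for confirmation.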

2. Create another function to handle transcripts.

Speechmatics has separate partial (AddPartialTranscript) and final (AddTranscript) transcript messages. The session termination message is EndOfTranscript.

AssemblyAI instead uses a Turn object with a turn_is_formatted boolean flag to indicate finality. The session termination message is Termination. For more on the Turn object, see the Streaming API Core concepts section.

def on_message(ws, message):
    global audio_seq_no

    try:
        data = json.loads(message)
        msg_type = data.get('message')

        if msg_type == "RecognitionStarted":
            session_id = data.get('id')
            print(f"\nSession began: ID={session_id}")

            # Start sending audio data in a separate thread
            def stream_audio():
                global audio_seq_no, stream
                print("Starting audio streaming...")
                while not stop_event.is_set():
                    try:
                        audio_data = stream.read(FRAMES_PER_BUFFER, exception_on_overflow=False)
                        # Send audio data as binary message
                        ws.send(audio_data, websocket.ABNF.OPCODE_BINARY)
                        audio_seq_no += 1
                    except Exception as e:
                        print(f"Error streaming audio: {e}")
                        # If stream read fails, likely means it's closed, stop the loop
                        break
                print("Audio streaming stopped.")

            global audio_thread
            audio_thread = threading.Thread(target=stream_audio)
            audio_thread.daemon = True  # Allow main thread to exit even if this thread is running
            audio_thread.start()

        elif msg_type == "AddPartialTranscript":
            transcript = data.get('metadata', {}).get('transcript', '')
            if transcript:
                print(f"\r{transcript}", end='')

        elif msg_type == "AddTranscript":
            transcript = data.get('metadata', {}).get('transcript', '')
            if transcript:
                # Clear previous line for final messages
                print('\r' + ' ' * 80 + '\r', end='')
                print(transcript)

        elif msg_type == "EndOfTranscript":
            print("\nSession Terminated: Transcription complete")

        elif msg_type == "Error":
            error_type = data.get('type')
            reason = data.get('reason')
            print(f"\nWebSocket Error: {error_type} - {reason}")
            stop_event.set()

    except json.JSONDecodeError as e:
        print(f"Error decoding message: {e}")
    except Exception as e:
        print(f"Error handling message: {e}")
Transcript message structure

Please note the difference in transcript message structure below:

# Speechmatics
{
    "message": "AddPartialTranscript",
    "metadata": {
        "transcript": "hello world"
    },
    # Other transcript data...
}

# AssemblyAI
{
    "type": "Turn",
    "transcript": "hello world",
    "turn_is_formatted": false,
    # Other transcript data...
}
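
An AssemblyAI on_message handler can therefore branch on the type field rather than on message. Below is a minimal sketch, assuming the fields shown above along with the Begin and Termination lifecycle messages from the v3 Streaming API:

def on_message(ws, message):
    data = json.loads(message)
    msg_type = data.get("type")

    if msg_type == "Begin":
        print(f"\nSession began: ID={data.get('id')}")
    elif msg_type == "Turn":
        transcript = data.get("transcript", "")
        if data.get("turn_is_formatted"):
            # Formatted turns are final: clear the partial line and print
            print('\r' + ' ' * 80 + '\r', end='')
            print(transcript)
        else:
            # Unformatted turns stream in like partials
            print(f"\r{transcript}", end='')
    elif msg_type == "Termination":
        print("\nSession Terminated")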

Step 5: Connect and start transcription

To stream audio, establish a WebSocket connection to the real-time service.

def run():
    global audio, stream, ws_app, SAMPLE_RATE
    # Skipping audio/microphone setup code...

    # Create WebSocketApp
    ws_app = websocket.WebSocketApp(
        API_ENDPOINT,
        header={"Authorization": f"Bearer {YOUR_API_KEY}"},  # Speechmatics uses Bearer token
        on_open=on_open,
        on_message=on_message,
        on_error=on_error,
        on_close=on_close,
    )

    # Run WebSocketApp in a separate thread to allow main thread to catch KeyboardInterrupt
    ws_thread = threading.Thread(target=lambda: ws_app.run_forever(ping_interval=30, ping_timeout=10))
    ws_thread.daemon = True
    ws_thread.start()
Authorization

Note that while both services use an Authorization header to authenticate the WebSocket connection, Speechmatics uses a Bearer prefix, while AssemblyAI does not.
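
The AssemblyAI equivalent might look like the sketch below: the same WebSocketApp pattern, but with the raw API key in the Authorization header and the session configuration already encoded into API_ENDPOINT.

ws_app = websocket.WebSocketApp(
    API_ENDPOINT,  # e.g. wss://streaming.assemblyai.com/v3/ws?sample_rate=16000
    header={"Authorization": YOUR_API_KEY},  # Raw key, no "Bearer" prefix
    on_open=on_open,
    on_message=on_message,
    on_error=on_error,
    on_close=on_close,
)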

Step 6: Close the connection

Keep the main thread alive until it is interrupted, handle keyboard interrupts and raised exceptions, and clean up when the WebSocket connection closes.

def run():
    global audio, stream, ws_app, SAMPLE_RATE
    # Skipping audio/microphone setup and WebSocket connection code...

    try:
        # Keep main thread alive until interrupted
        while ws_thread.is_alive():
            time.sleep(0.1)
    except KeyboardInterrupt:
        print("\nCtrl+C received. Stopping...")
        stop_event.set()  # Signal audio thread to stop

        # Send EndOfStream message to the server
        if ws_app and ws_app.sock and ws_app.sock.connected:
            try:
                end_message = {
                    "message": "EndOfStream",
                    "last_seq_no": audio_seq_no
                }
                print(f"Sending termination message: {json.dumps(end_message)}")
                ws_app.send(json.dumps(end_message))
                # Give a moment for messages to process before forceful close
                time.sleep(1)
            except Exception as e:
                print(f"Error sending termination message: {e}")

        # Close the WebSocket connection (will trigger on_close)
        if ws_app:
            ws_app.close()

        # Wait for WebSocket thread to finish
        ws_thread.join(timeout=2.0)

    except Exception as e:
        print(f"\nAn unexpected error occurred: {e}")
        stop_event.set()
        if ws_app:
            ws_app.close()
        ws_thread.join(timeout=2.0)

    finally:
        # Final cleanup (already handled in on_close, but good as a fallback)
        if stream and stream.is_active():
            stream.stop_stream()
        if stream:
            stream.close()
        if audio:
            audio.terminate()
        print("Cleanup complete. Exiting.")

The connection will close automatically when you press Ctrl+C. Whether the session ends from an interrupt or an error, the on_close handler will clean up the audio resources.
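
The termination messages also differ: Speechmatics expects EndOfStream with the last audio sequence number, while AssemblyAI expects a simple Terminate message. Below is a sketch of the AssemblyAI side, based on the v3 Streaming API:

# Inside the KeyboardInterrupt handler, in place of the EndOfStream message:
if ws_app and ws_app.sock and ws_app.sock.connected:
    try:
        terminate_message = {"type": "Terminate"}
        print(f"Sending termination message: {json.dumps(terminate_message)}")
        ws_app.send(json.dumps(terminate_message))
        time.sleep(1)  # Allow the server's final Termination message to arrive
    except Exception as e:
        print(f"Error sending termination message: {e}")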

Step 7: Execute the main function

Finally, run the main function to start the program.

if __name__ == "__main__":
    run()

Next steps

To learn more about both Streaming APIs, their key differences, and how to best migrate, see the following resources:

  • AssemblyAI Streaming Speech-to-Text documentation

  • Speechmatics real-time transcription documentation

Need some help?

If you get stuck or have any other questions, contact our support team at support@assemblyai.com or create a support ticket.