Building a Real-Time Voice Assistant

Name: GridStorm
Availability: InStock

End-to-end pipeline: microphone → voice activity detection → Whisper → LLM response → text-to-speech output.

Advanced · 22 min read

End-to-End Voice Pipeline

Flow:

Microphone — Capture audio stream
VAD — Voice activity detection
Whisper — Speech to text
LLM — Generate response
TTS — Text to speech output

import sounddevice as sd
import numpy as np
import queue, threading
from faster_whisper import WhisperModel
from openai import OpenAI
import pyttsx3

# Speech-to-text: the "base" faster-whisper model, run on CPU with int8 compute.
whisper = WhisperModel("base", device="cpu", compute_type="int8")
# Chat client built with no explicit arguments — presumably it picks up its
# credentials from the environment (e.g. OPENAI_API_KEY); confirm against the
# openai package documentation.
openai_client = OpenAI()
# Text-to-speech engine with its 'rate' property set to 175
# (presumably words per minute — confirm against the pyttsx3 documentation).
tts = pyttsx3.init()
tts.setProperty('rate', 175)

# Audio capture settings.
SAMPLE_RATE = 16_000  # sample rate requested from the input stream
CHUNK_SIZE = SAMPLE_RATE // 2  # samples per block: half a second of audio

# End-of-utterance detection.
SILENCE_THRESHOLD = 0.01  # a chunk whose RMS is at or below this counts as silent
SILENCE_CHUNKS = 6  # silent chunks in a row that end an utterance (6 x 0.5 s = 3 s)

# Hand-off point between the audio callback and the processing loop.
audio_queue = queue.Queue()

# Running conversation, seeded with the system prompt.
history = [
    dict(
        role="system",
        content="You are a helpful voice assistant. Keep responses under 3 sentences.",
    )
]

def capture():
    """Open the microphone stream and push every audio block onto ``audio_queue``.

    Never returns: once the stream is open the calling thread waits
    indefinitely so that the stream stays active.
    """
    def _enqueue(indata, *_ignored):
        # Queue a copy of the block rather than the array handed to the callback.
        audio_queue.put(indata.copy())

    stream = sd.InputStream(
        samplerate=SAMPLE_RATE,
        channels=1,
        dtype='float32',
        blocksize=CHUNK_SIZE,
        callback=_enqueue,
    )
    with stream:
        print("🎤 Listening...")
        # Nothing ever sets this freshly created Event, so wait() blocks this
        # thread for good while the ``with`` block keeps the stream open.
        threading.Event().wait()

def _drain_audio_queue():
    """Throw away every chunk currently waiting in ``audio_queue``."""
    while True:
        try:
            audio_queue.get_nowait()
        except queue.Empty:
            return


def _transcribe(chunks):
    """Join the buffered chunks into one 1-D signal and return Whisper's text."""
    audio = np.concatenate(chunks).flatten()
    segments, _ = whisper.transcribe(audio, vad_filter=True)
    return " ".join(seg.text.strip() for seg in segments).strip()


def _respond(text):
    """Send *text* to the LLM along with ``history`` and speak the reply.

    ``history`` gains the user/assistant pair only when a non-empty reply
    comes back; on failure the user turn is rolled back so the conversation
    never contains an unanswered message.
    """
    print(f"\n👤 You: {text}")
    history.append({"role": "user", "content": text})
    try:
        resp = openai_client.chat.completions.create(
            model="gpt-4o-mini", messages=history, max_tokens=150
        ).choices[0].message.content
    except Exception as exc:
        # Boundary of the long-running loop: a failed request (network, auth,
        # rate limit, ...) should cost one turn, not kill the assistant.
        history.pop()
        print(f"⚠️ LLM request failed: {exc}")
        return
    if not resp:
        # No text came back; there is nothing to store or speak.
        history.pop()
        return

    history.append({"role": "assistant", "content": resp})
    print(f"🤖 Assistant: {resp}")
    tts.say(resp)
    tts.runAndWait()
    # The microphone kept recording while we waited on the LLM and while the
    # reply was being spoken. Discard that backlog so the assistant does not
    # treat its own voice as the next user utterance.
    _drain_audio_queue()


def process():
    """Consume audio chunks forever, answering each detected utterance.

    A chunk whose RMS exceeds ``SILENCE_THRESHOLD`` starts or extends an
    utterance. Once ``SILENCE_CHUNKS`` consecutive quieter chunks follow, the
    buffered audio is transcribed and, if the transcript is longer than two
    characters, answered by the LLM and spoken aloud. Never returns.
    """
    buf, silent = [], 0
    while True:
        chunk = audio_queue.get()
        loud = np.sqrt(np.mean(chunk ** 2)) > SILENCE_THRESHOLD
        if not loud and not buf:
            continue  # silence before any speech: nothing to buffer yet

        buf.append(chunk)
        silent = 0 if loud else silent + 1
        if silent < SILENCE_CHUNKS:
            continue  # utterance still in progress

        text = _transcribe(buf)
        buf, silent = [], 0
        # Very short transcripts are ignored.
        if len(text) > 2:
            _respond(text)

# Microphone capture runs on a daemon thread, so it does not keep the
# interpreter alive on its own; the processing loop occupies the main thread.
threading.Thread(
    daemon=True,
    target=capture,
).start()
process()

Part of the Speech Recognition & LLM Engineering series on Tekivex. Browse all tutorials or explore our open-source products.