Build an Agentic Voice AI That Understands, Plans, and Speaks Autonomously
This tutorial shows how to assemble a real-time voice AI agent that transcribes, reasons, plans, and speaks using Whisper and SpeechT5.
Overview
This tutorial walks through building an agentic voice AI assistant that listens, understands, reasons across multiple steps, plans actions, and responds through natural speech in real time. The system integrates speech-to-text, intent and entity extraction, multi-step reasoning, and text-to-speech, using models like Whisper and SpeechT5. Follow the code samples to assemble a runnable pipeline and see a demo showcasing perception, reasoning, and execution working together.
Prerequisites and setup
Install the Python libraries required for speech recognition, synthesis, and model inference, then initialize the environment and suppress noisy warnings.
import subprocess
import sys
import json
import re
from datetime import datetime
from typing import Dict, List, Tuple, Any
def install_packages():
    packages = ['transformers', 'torch', 'torchaudio', 'datasets', 'soundfile',
                'librosa', 'IPython', 'numpy']
    for pkg in packages:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', pkg])
print(" Initializing Agentic Voice AI...")
install_packages()
import torch
import soundfile as sf
import numpy as np
from transformers import (AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline,
SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan)
from IPython.display import Audio, display, HTML
import warnings
warnings.filterwarnings('ignore')

Perception layer: transcribe and extract meaning
The perception layer converts audio to text and extracts intent, entities, and sentiment. The agent stores perceptions in memory for context-aware responses.
class VoiceAgent:
    def __init__(self):
        self.memory = []
        self.context = {}
        self.tools = {}
        self.goals = []

    def perceive(self, audio_input: str) -> Dict[str, Any]:
        # Build a structured perception from the transcribed text and store it in memory.
        intent = self._extract_intent(audio_input)
        entities = self._extract_entities(audio_input)
        sentiment = self._analyze_sentiment(audio_input)
        perception = {
            'text': audio_input,
            'intent': intent,
            'entities': entities,
            'sentiment': sentiment,
            'timestamp': datetime.now().isoformat()
        }
        self.memory.append(perception)
        return perception

    def _extract_intent(self, text: str) -> str:
        # Keyword-based intent detection; falls back to generic conversation.
        text_lower = text.lower()
        intent_patterns = {
            'create': ['create', 'make', 'generate', 'write'],
            'search': ['search', 'find', 'look for', 'show me'],
            'analyze': ['analyze', 'explain', 'understand', 'what is'],
            'calculate': ['calculate', 'compute', 'how much', 'sum'],
            'schedule': ['schedule', 'plan', 'set reminder', 'meeting'],
            'translate': ['translate', 'say in', 'convert to'],
            'summarize': ['summarize', 'brief', 'tldr', 'overview']
        }
        for intent, keywords in intent_patterns.items():
            if any(kw in text_lower for kw in keywords):
                return intent
        return 'conversation'

    def _extract_entities(self, text: str) -> Dict[str, List[str]]:
        # Regex-based extraction of numbers, dates, times, and email addresses.
        entities = {
            'numbers': re.findall(r'\d+', text),
            'dates': re.findall(r'\b\d{1,2}/\d{1,2}/\d{2,4}\b', text),
            'times': re.findall(r'\b\d{1,2}:\d{2}\s*(?:am|pm)?\b', text.lower()),
            'emails': re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
        }
        return {k: v for k, v in entities.items() if v}

    def _analyze_sentiment(self, text: str) -> str:
        # Simple lexicon-based sentiment: compare counts of positive vs. negative words.
        positive = ['good', 'great', 'excellent', 'happy', 'love', 'thank']
        negative = ['bad', 'terrible', 'sad', 'hate', 'angry', 'problem']
        text_lower = text.lower()
        pos_count = sum(1 for word in positive if word in text_lower)
        neg_count = sum(1 for word in negative if word in text_lower)
        if pos_count > neg_count:
            return 'positive'
        elif neg_count > pos_count:
            return 'negative'
        return 'neutral'

This code provides a simple but practical approach to detecting user intent and extracting the entities and sentiment that feed into the reasoning pipeline.
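To sanity-check the perception layer on its own, you can run a quick text-only probe before any audio is involved. A minimal sketch using the class above (the sample utterance is arbitrary):

# Quick check of the perception layer with plain text (no audio yet).
agent = VoiceAgent()
perception = agent.perceive("Calculate the sum of 25 and 37")
print(perception['intent'])     # -> 'calculate'
print(perception['entities'])   # -> {'numbers': ['25', '37']}
print(perception['sentiment'])  # -> 'neutral'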
Reasoning and planning
The agent maps intents to goals, checks prerequisites, constructs multi-step plans, and computes confidence. The methods below continue the VoiceAgent class; each plan is later executed step by step.
    # (continuing the VoiceAgent class)
    def reason(self, perception: Dict) -> Dict[str, Any]:
        intent = perception['intent']
        reasoning = {
            'goal': self._identify_goal(intent),
            'prerequisites': self._check_prerequisites(intent),
            'plan': self._create_plan(intent, perception['entities']),
            'confidence': self._calculate_confidence(perception)
        }
        return reasoning

    def act(self, reasoning: Dict) -> str:
        # Execute the plan step by step, then turn the results into a spoken response.
        plan = reasoning['plan']
        results = []
        for step in plan['steps']:
            result = self._execute_step(step)
            results.append(result)
        response = self._generate_response(results, reasoning)
        return response

    def _identify_goal(self, intent: str) -> str:
        goal_mapping = {
            'create': 'Generate new content',
            'search': 'Retrieve information',
            'analyze': 'Understand and explain',
            'calculate': 'Perform computation',
            'schedule': 'Organize time-based tasks',
            'translate': 'Convert between languages',
            'summarize': 'Condense information'
        }
        return goal_mapping.get(intent, 'Assist user')

    def _check_prerequisites(self, intent: str) -> List[str]:
        prereqs = {
            'search': ['internet access', 'search tool'],
            'calculate': ['math processor'],
            'translate': ['translation model'],
            'schedule': ['calendar access']
        }
        return prereqs.get(intent, ['language understanding'])

    def _create_plan(self, intent: str, entities: Dict) -> Dict:
        plans = {
            'create': {'steps': ['understand_requirements', 'generate_content', 'validate_output'], 'estimated_time': '10s'},
            'analyze': {'steps': ['parse_input', 'analyze_components', 'synthesize_explanation'], 'estimated_time': '5s'},
            'calculate': {'steps': ['extract_numbers', 'determine_operation', 'compute_result'], 'estimated_time': '2s'}
        }
        default_plan = {'steps': ['understand_query', 'process_information', 'formulate_response'], 'estimated_time': '3s'}
        return plans.get(intent, default_plan)

Confidence, execution and response generation
These helper methods, also part of VoiceAgent, compute a confidence score, execute planned steps (placeholder implementations here), and generate a natural-language response that reflects the agent's reasoning and memory.
    # (continuing the VoiceAgent class)
    def _calculate_confidence(self, perception: Dict) -> float:
        # Heuristic confidence: start at 0.7 and add small boosts for richer input.
        base_confidence = 0.7
        if perception['entities']:
            base_confidence += 0.15
        if perception['sentiment'] != 'neutral':
            base_confidence += 0.1
        if len(perception['text'].split()) > 5:
            base_confidence += 0.05
        return min(base_confidence, 1.0)

    def _execute_step(self, step: str) -> Dict:
        # Placeholder execution; a real agent would dispatch to registered tools here.
        return {'step': step, 'status': 'completed', 'output': f'Executed {step}'}

    def _generate_response(self, results: List, reasoning: Dict) -> str:
        goal = reasoning['goal']
        confidence = reasoning['confidence']
        prefix = "I understand you want to" if confidence > 0.8 else "I think you're asking me to"
        response = f"{prefix} {goal.lower()}. "
        if len(self.memory) > 1:
            response += "Based on our conversation, "
        response += f"I've analyzed your request and completed {len(results)} steps. "
        return response
Voice input/output: Whisper and SpeechT5
This module loads speech models and exposes listen and speak methods. It uses Whisper for ASR and SpeechT5 (with HiFi-GAN vocoder) for synthesis.
class VoiceIO:
    def __init__(self):
        print("Loading voice models...")
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.stt_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
        self.tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        self.tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        # Random speaker embedding keeps the demo self-contained; see the note below for using a real x-vector.
        self.speaker_embeddings = torch.randn(1, 512) * 0.1
        print("✓ Voice I/O ready")

    def listen(self, audio_path: str) -> str:
        result = self.stt_pipe(audio_path)
        return result['text']

    def speak(self, text: str, output_path: str = "response.wav") -> Tuple[str, np.ndarray]:
        # Synthesize speech with SpeechT5 + HiFi-GAN and save a 16 kHz WAV.
        inputs = self.tts_processor(text=text, return_tensors="pt")
        speech = self.tts_model.generate_speech(inputs["input_ids"], self.speaker_embeddings, vocoder=self.vocoder)
        sf.write(output_path, speech.numpy(), samplerate=16000)
        return output_path, speech.numpy()
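The random speaker embedding works, but the synthesized voice usually sounds more natural with a real speaker x-vector. A minimal sketch, assuming the publicly available Matthijs/cmu-arctic-xvectors dataset on the Hugging Face Hub (index 7306 is just one commonly used voice):

# Optional: swap the random embedding for a real x-vector
# (assumes the Matthijs/cmu-arctic-xvectors dataset is reachable from your environment).
from datasets import load_dataset

xvectors = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
real_embedding = torch.tensor(xvectors[7306]["xvector"]).unsqueeze(0)  # shape (1, 512)

voice_io = VoiceIO()
voice_io.speaker_embeddings = real_embedding  # overrides the random vector set in __init__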
Integration: the interactive assistant
Combine the agent and voice I/O into a single assistant object that runs the pipeline: listen, perceive, reason, act, speak.
class AgenticVoiceAssistant:
    def __init__(self):
        self.agent = VoiceAgent()
        self.voice_io = VoiceIO()
        self.interaction_count = 0

    def process_voice_input(self, audio_path: str) -> Dict:
        # Full pipeline: listen -> perceive -> reason -> act -> speak.
        text_input = self.voice_io.listen(audio_path)
        perception = self.agent.perceive(text_input)
        reasoning = self.agent.reason(perception)
        response_text = self.agent.act(reasoning)
        response_path, audio_array = self.voice_io.speak(response_text)
        self.interaction_count += 1
        return {
            'input_text': text_input,
            'perception': perception,
            'reasoning': reasoning,
            'response_text': response_text,
            'audio_path': response_path,
            'audio_array': audio_array
        }

Debugging and demo visualization
A helper method on the assistant prints a styled HTML block showing the input, perception, reasoning, and response, which helps visualize the agent's internal state during a demo.
    # (continuing the AgenticVoiceAssistant class)
    def display_reasoning(self, result: Dict):
        html = f"""
        <div style='background: #1e1e1e; color: #fff; padding: 20px; border-radius: 10px; font-family: monospace;'>
            <h2 style='color: #4CAF50;'>Agent Reasoning Process</h2>
            <div><strong style='color: #2196F3;'>INPUT:</strong> {result['input_text']}</div>
            <div><strong style='color: #FF9800;'>PERCEPTION:</strong>
                <ul>
                    <li>Intent: {result['perception']['intent']}</li>
                    <li>Entities: {result['perception']['entities']}</li>
                    <li>Sentiment: {result['perception']['sentiment']}</li>
                </ul>
            </div>
            <div><strong style='color: #9C27B0;'>REASONING:</strong>
                <ul>
                    <li>Goal: {result['reasoning']['goal']}</li>
                    <li>Plan: {len(result['reasoning']['plan']['steps'])} steps</li>
                    <li>Confidence: {result['reasoning']['confidence']:.2%}</li>
                </ul>
            </div>
            <div><strong style='color: #4CAF50;'>RESPONSE:</strong> {result['response_text']}</div>
        </div>
        """
        display(HTML(html))

Running a demo
The run_agentic_demo function simulates a few scenarios: it synthesizes each simulated input as audio, processes it through the assistant, displays the reasoning trace, and plays the spoken response. It highlights the assistant's capabilities: perception, intent recognition, planning, execution, and spoken output.
def run_agentic_demo():
    print("\n" + "="*70)
    print("AGENTIC VOICE AI ASSISTANT")
    print("="*70 + "\n")
    assistant = AgenticVoiceAssistant()
    scenarios = [
        "Create a summary of machine learning concepts",
        "Calculate the sum of twenty five and thirty seven",
        "Analyze the benefits of renewable energy"
    ]
    for i, scenario_text in enumerate(scenarios, 1):
        print(f"\n--- Scenario {i} ---")
        print(f"Simulated Input: '{scenario_text}'")
        audio_path, _ = assistant.voice_io.speak(scenario_text, f"input_{i}.wav")
        result = assistant.process_voice_input(audio_path)
        assistant.display_reasoning(result)
        print("\nPlaying agent's voice response...")
        display(Audio(result['audio_array'], rate=16000))
        print("\n" + "-"*70)
    print(f"\nCompleted {assistant.interaction_count} agentic interactions")
    print("\nKey Agentic Capabilities Demonstrated:")
    print("  • Autonomous perception and understanding")
    print("  • Intent recognition and entity extraction")
    print("  • Multi-step reasoning and planning")
    print("  • Goal-driven action execution")
    print("  • Natural language response generation")
    print("  • Memory and context management")

if __name__ == "__main__":
    run_agentic_demo()

What this shows
The example ties perception, reasoning, and execution into a voice interface that can operate autonomously on multi-step instructions. The implementation uses practical building blocks suitable for experimentation and extension: swap or fine-tune models, connect external tools (search, calculators, calendars), and expand planning logic for more complex agent behaviors.
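One concrete way to extend the agent is to wire its tools dictionary into real capabilities. The sketch below registers a toy calculator and calls it from outside the class; calculator_tool is an illustrative helper, not part of the tutorial code, and a fuller integration would dispatch to it inside _execute_step.

# Hypothetical extension: register a real tool and route 'calculate' requests to it.
def calculator_tool(numbers):
    # Toy implementation: sums whatever numbers the perception layer extracted.
    return sum(int(n) for n in numbers)

agent = VoiceAgent()
agent.tools['calculate'] = calculator_tool

perception = agent.perceive("Calculate the sum of 25 and 37")
numbers = perception['entities'].get('numbers', [])
if perception['intent'] in agent.tools and numbers:
    print(agent.tools[perception['intent']](numbers))  # -> 62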