Build an Advanced WhisperX Voice AI Pipeline for Transcription, Alignment, Analysis and Export
Dependencies and quick setup
Start by installing WhisperX and a few helper libraries, then import the modules and define a simple CONFIG dictionary that detects CUDA and selects the compute type, model size, and batch size.
!pip install -q git+https://github.com/m-bain/whisperX.git
!pip install -q pandas matplotlib seaborn
import whisperx
import torch
import gc
import os
import json
import pandas as pd
from pathlib import Path
from IPython.display import Audio, display, HTML
import warnings
warnings.filterwarnings('ignore')
CONFIG = {
"device": "cuda" if torch.cuda.is_available() else "cpu",
"compute_type": "float16" if torch.cuda.is_available() else "int8",
"batch_size": 16,
"model_size": "base",
"language": None,
}
print(f" Running on: {CONFIG['device']}")
print(f" Compute type: {CONFIG['compute_type']}")
print(f" Model: {CONFIG['model_size']}")
This initializes the environment in a way that works both on GPU-enabled Colab and CPU-only machines.
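If you want to trade speed for accuracy or skip language detection, you can override these defaults before running anything else. The values below are illustrative choices, not requirements:
# Optional overrides (illustrative values)
# CONFIG["model_size"] = "small"   # "medium" or "large-v2" are slower but more accurate
# CONFIG["language"] = "en"        # set this to skip automatic language detection
# CONFIG["batch_size"] = 8         # reduce if you hit GPU out-of-memory errors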
Downloading and loading an example audio
Use a helper to fetch a sample file and another to load and show basic info about the audio (duration, sample rate) while playing it inline in notebooks.
def download_sample_audio():
"""Download a sample audio file for testing"""
!wget -q -O sample.mp3 https://github.com/mozilla-extensions/speaktome/raw/master/content/cv-valid-dev/sample-000000.mp3
print(" Sample audio downloaded")
return "sample.mp3"
def load_and_analyze_audio(audio_path):
"""Load audio and display basic info"""
audio = whisperx.load_audio(audio_path)
duration = len(audio) / 16000
print(f" Audio: {Path(audio_path).name}")
print(f" Duration: {duration:.2f} seconds")
print(f" Sample rate: 16000 Hz")
display(Audio(audio_path))
return audio, duration
Transcription with WhisperX (batched inference)
Load a WhisperX model, transcribe the audio in batches, and return a structured result containing segments and the detected language. The function also prints summary stats and clears model memory when finished.
def transcribe_audio(audio, model_size=CONFIG["model_size"], language=None):
"""Transcribe audio using WhisperX (batched inference)"""
print("\n STEP 1: Transcribing audio...")
model = whisperx.load_model(
model_size,
CONFIG["device"],
compute_type=CONFIG["compute_type"]
)
transcribe_kwargs = {
"batch_size": CONFIG["batch_size"]
}
if language:
transcribe_kwargs["language"] = language
result = model.transcribe(audio, **transcribe_kwargs)
total_segments = len(result["segments"])
del model
gc.collect()
if CONFIG["device"] == "cuda":
torch.cuda.empty_cache()
print(f" Transcription complete!")
print(f" Language: {result['language']}")
print(f" Segments: {total_segments}")
print(f" Total text length: {sum(len(seg['text']) for seg in result['segments'])} characters")
return result
This step gives you segment-level transcriptions and prepares them for alignment if you need word-level timestamps.
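If you want to sanity-check the raw output before alignment, a small preview helper is handy. This is a minimal sketch assuming the standard WhisperX result shape (a "segments" list plus a "language" code); the name preview_segments is introduced here for the tutorial, not part of the WhisperX API.
def preview_segments(result, n=3):
    """Print the first n segments from a WhisperX transcription result."""
    for seg in result["segments"][:n]:
        print(f"[{seg['start']:.2f}s - {seg['end']:.2f}s] {seg['text'].strip()}")
    print(f"Detected language: {result.get('language', 'unknown')}")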
Aligning transcripts to get word-level timestamps
Load the alignment model and run alignment to produce precise word start/end times. The routine is defensive: on errors it falls back to segment-level timestamps and continues.
def align_transcription(segments, audio, language_code):
"""Align transcription for accurate word-level timestamps"""
print("\n STEP 2: Aligning for word-level timestamps...")
try:
model_a, metadata = whisperx.load_align_model(
language_code=language_code,
device=CONFIG["device"]
)
result = whisperx.align(
segments,
model_a,
metadata,
audio,
CONFIG["device"],
return_char_alignments=False
)
total_words = sum(len(seg.get("words", [])) for seg in result["segments"])
del model_a
gc.collect()
if CONFIG["device"] == "cuda":
torch.cuda.empty_cache()
print(f" Alignment complete!")
print(f" Aligned words: {total_words}")
return result
except Exception as e:
print(f" Alignment failed: {str(e)}")
print(" Continuing with segment-level timestamps only...")
return {"segments": segments, "word_segments": []}
Use alignment to refine timing for downstream tasks like subtitles, forced-alignment visualizations, or precise audio indexing.
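For example, once alignment succeeds you can flatten the per-word timestamps into simple rows for subtitles or search indexing. This sketch assumes the aligned result format shown above; the guard is there because WhisperX can omit timestamps for tokens it cannot align (such as digits), and the helper name words_to_rows is illustrative.
def words_to_rows(aligned_result):
    """Flatten aligned word timestamps into (start, end, word) tuples."""
    rows = []
    for seg in aligned_result["segments"]:
        for word in seg.get("words", []):
            # Some tokens may come back without timestamps; skip those
            if "start" in word and "end" in word:
                rows.append((word["start"], word["end"], word["word"]))
    return rows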
Analyzing the transcription
Compute simple statistics: total duration, counts of segments/words/characters, words per minute, pause durations, and average word duration.
def analyze_transcription(result):
"""Generate statistics about the transcription"""
print("\n TRANSCRIPTION STATISTICS")
print("="*70)
segments = result["segments"]
total_duration = max(seg["end"] for seg in segments) if segments else 0
total_words = sum(len(seg.get("words", [])) for seg in segments)
total_chars = sum(len(seg["text"].strip()) for seg in segments)
print(f"Total duration: {total_duration:.2f} seconds")
print(f"Total segments: {len(segments)}")
print(f"Total words: {total_words}")
print(f"Total characters: {total_chars}")
if total_duration > 0:
print(f"Words per minute: {(total_words / total_duration * 60):.1f}")
pauses = []
for i in range(len(segments) - 1):
pause = segments[i+1]["start"] - segments[i]["end"]
if pause > 0:
pauses.append(pause)
if pauses:
print(f"Average pause between segments: {sum(pauses)/len(pauses):.2f}s")
print(f"Longest pause: {max(pauses):.2f}s")
word_durations = []
for seg in segments:
if "words" in seg:
for word in seg["words"]:
duration = word["end"] - word["start"]
word_durations.append(duration)
if word_durations:
print(f"Average word duration: {sum(word_durations)/len(word_durations):.3f}s")
print("="*70)
These stats help you understand speaking rate, pauses and micro-timings that may indicate edits or stutters.
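Since matplotlib was installed at the top but not used yet, you can also turn the same segment timings into a quick timeline plot. This is a minimal sketch under that assumption; plot_segment_timeline is a name introduced here, not a WhisperX function.
def plot_segment_timeline(result, figsize=(10, 2)):
    """Draw each segment as a horizontal bar; gaps between bars are pauses."""
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(figsize=figsize)
    for seg in result["segments"]:
        ax.barh(0, seg["end"] - seg["start"], left=seg["start"],
                height=0.5, alpha=0.6, edgecolor="black")
    ax.set_xlabel("Time (s)")
    ax.set_yticks([])
    ax.set_title("Segment timeline")
    plt.tight_layout()
    plt.show()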
Presenting, exporting and formatting results
A display helper builds a DataFrame for segments or word-level rows, then shows it in notebooks. Export helpers write JSON, SRT, VTT, TXT and CSV files. Utility formatters convert seconds to SRT/VTT timestamps.
def display_results(result, show_words=False, max_rows=50):
"""Display transcription results in formatted table"""
data = []
for seg in result["segments"]:
text = seg["text"].strip()
start = f"{seg['start']:.2f}s"
end = f"{seg['end']:.2f}s"
duration = f"{seg['end'] - seg['start']:.2f}s"
if show_words and "words" in seg:
for word in seg["words"]:
data.append({
"Start": f"{word['start']:.2f}s",
"End": f"{word['end']:.2f}s",
"Duration": f"{word['end'] - word['start']:.3f}s",
"Text": word["word"],
"Score": f"{word.get('score', 0):.2f}"
})
else:
data.append({
"Start": start,
"End": end,
"Duration": duration,
"Text": text
})
df = pd.DataFrame(data)
if len(df) > max_rows:
print(f"Showing first {max_rows} rows of {len(df)} total...")
display(HTML(df.head(max_rows).to_html(index=False)))
else:
display(HTML(df.to_html(index=False)))
return df
def export_results(result, output_dir="output", filename="transcript"):
"""Export results in multiple formats"""
os.makedirs(output_dir, exist_ok=True)
json_path = f"{output_dir}/{filename}.json"
with open(json_path, "w", encoding="utf-8") as f:
json.dump(result, f, indent=2, ensure_ascii=False)
srt_path = f"{output_dir}/{filename}.srt"
with open(srt_path, "w", encoding="utf-8") as f:
for i, seg in enumerate(result["segments"], 1):
start = format_timestamp(seg["start"])
end = format_timestamp(seg["end"])
f.write(f"{i}\n{start} --> {end}\n{seg['text'].strip()}\n\n")
vtt_path = f"{output_dir}/{filename}.vtt"
with open(vtt_path, "w", encoding="utf-8") as f:
f.write("WEBVTT\n\n")
for i, seg in enumerate(result["segments"], 1):
start = format_timestamp_vtt(seg["start"])
end = format_timestamp_vtt(seg["end"])
f.write(f"{start} --> {end}\n{seg['text'].strip()}\n\n")
txt_path = f"{output_dir}/{filename}.txt"
with open(txt_path, "w", encoding="utf-8") as f:
for seg in result["segments"]:
f.write(f"{seg['text'].strip()}\n")
csv_path = f"{output_dir}/{filename}.csv"
df_data = []
for seg in result["segments"]:
df_data.append({
"start": seg["start"],
"end": seg["end"],
"text": seg["text"].strip()
})
pd.DataFrame(df_data).to_csv(csv_path, index=False)
print(f"\n Results exported to '{output_dir}/' directory:")
print(f" ✓ {filename}.json (full structured data)")
print(f" ✓ {filename}.srt (subtitles)")
print(f" ✓ {filename}.vtt (web video subtitles)")
print(f" ✓ {filename}.txt (plain text)")
print(f" ✓ {filename}.csv (timestamps + text)")
def format_timestamp(seconds):
"""Convert seconds to SRT timestamp format"""
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
secs = int(seconds % 60)
millis = int((seconds % 1) * 1000)
return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
def format_timestamp_vtt(seconds):
"""Convert seconds to VTT timestamp format"""
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
secs = int(seconds % 60)
millis = int((seconds % 1) * 1000)
return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"
These helpers let you quickly create subtitle files or CSV exports useful for indexing or downstream processing.
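A typical call looks like the commented sketch below; the directory and filename are examples, and aligned_result stands for the output of align_transcription:
# Illustrative usage of the export helpers
# export_results(aligned_result, output_dir="output", filename="meeting_01")
# print(sorted(os.listdir("output")))  # expect meeting_01.csv/.json/.srt/.txt/.vtt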
Batch processing and keyword extraction
You can process multiple files in a loop and export each result. A simple keyword extractor uses basic tokenization and stop-word filtering to surface the most common words.
def batch_process_files(audio_files, output_dir="batch_output"):
"""Process multiple audio files in batch"""
print(f"\n Batch processing {len(audio_files)} files...")
results = {}
for i, audio_path in enumerate(audio_files, 1):
print(f"\n[{i}/{len(audio_files)}] Processing: {Path(audio_path).name}")
try:
result, _ = process_audio_file(audio_path, show_output=False)
results[audio_path] = result
filename = Path(audio_path).stem
export_results(result, output_dir, filename)
except Exception as e:
print(f" Error processing {audio_path}: {str(e)}")
results[audio_path] = None
print(f"\n Batch processing complete! Processed {len(results)} files.")
return results
def extract_keywords(result, top_n=10):
"""Extract most common words from transcription"""
from collections import Counter
import re
text = " ".join(seg["text"] for seg in result["segments"])
words = re.findall(r'\b\w+\b', text.lower())
stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
'of', 'with', 'is', 'was', 'are', 'were', 'be', 'been', 'being',
'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those'}
filtered_words = [w for w in words if w not in stop_words and len(w) > 2]
word_counts = Counter(filtered_words).most_common(top_n)
print(f"\n Top {top_n} Keywords:")
for word, count in word_counts:
print(f" {word}: {count}")
return word_counts
These utilities make it easy to scale to multiple files and quickly glean frequent terms from long recordings.
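For instance, you could gather every audio file in a folder with pathlib and feed the list to the batch helper; the folder name and extensions below are placeholders:
# Illustrative batch run over a folder of recordings
# audio_dir = Path("recordings")
# audio_files = [str(p) for p in sorted(audio_dir.iterdir())
#                if p.suffix.lower() in {".mp3", ".wav", ".m4a"}]
# results = batch_process_files(audio_files)
# for path, res in results.items():
#     if res is not None:
#         extract_keywords(res, top_n=5)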
Full pipeline helper and usage examples
A single function ties everything together: load audio, transcribe, align, analyze, display and export. You can call it on a sample file or your own audio, switch to a larger model for improved accuracy, or use the batch helper to process multiple files.
def process_audio_file(audio_path, show_output=True, analyze=True):
"""Complete WhisperX pipeline"""
if show_output:
print("="*70)
print(" WhisperX Advanced Tutorial")
print("="*70)
audio, duration = load_and_analyze_audio(audio_path)
result = transcribe_audio(audio, CONFIG["model_size"], CONFIG["language"])
aligned_result = align_transcription(
result["segments"],
audio,
result["language"]
)
if analyze and show_output:
analyze_transcription(aligned_result)
extract_keywords(aligned_result)
if show_output:
print("\n" + "="*70)
print(" TRANSCRIPTION RESULTS")
print("="*70)
df = display_results(aligned_result, show_words=False)
export_results(aligned_result)
else:
df = None
return aligned_result, df
# Example 1: Process sample audio
# audio_path = download_sample_audio()
# result, df = process_audio_file(audio_path)
# Example 2: Show word-level details
# result, df = process_audio_file(audio_path)
# word_df = display_results(result, show_words=True)
# Example 3: Process your own audio
# audio_path = "your_audio.wav" # or .mp3, .m4a, etc.
# result, df = process_audio_file(audio_path)
# Example 4: Batch process multiple files
# audio_files = ["audio1.mp3", "audio2.wav", "audio3.m4a"]
# results = batch_process_files(audio_files)
# Example 5: Use a larger model for better accuracy
# CONFIG["model_size"] = "large-v2"
# result, df = process_audio_file("audio.mp3")
print("\n Setup complete! Uncomment examples above to run.")
This recipe provides a flexible, memory-conscious workflow for transcription, alignment, analysis, and export. Use the provided functions as building blocks for integration into data pipelines, subtitle generation, or audio analytics systems.