Boost ASR Accuracy with SpeechBrain: Build a Denoise + Recognition Pipeline in Python

Overview

This tutorial demonstrates a compact, end-to-end pipeline: synthesize speech with gTTS, add noise at a controlled SNR, enhance it with SpeechBrain's MetricGAN+ model, and run ASR to compare word error rates (WER) before and after denoising. The pipeline is reproducible and runs on Colab or any local machine with the required Python packages.

Setup and imports

Install dependencies, set up paths and device, and import required libraries:

!pip -q install -U speechbrain gTTS jiwer pydub librosa soundfile torchaudio
!apt -qq install -y ffmpeg >/dev/null


import os, time, math, random, warnings, shutil, glob
warnings.filterwarnings("ignore")
import torch, torchaudio, numpy as np, librosa, soundfile as sf
from gtts import gTTS
from pydub import AudioSegment
from jiwer import wer
from pathlib import Path
from dataclasses import dataclass
from typing import List, Tuple
from IPython.display import Audio, display
# SpeechBrain >= 1.0 moved the pretrained interfaces to speechbrain.inference
try:
    from speechbrain.inference import EncoderDecoderASR, SpectralMaskEnhancement
except ImportError:  # fall back for older SpeechBrain releases
    from speechbrain.pretrained import EncoderDecoderASR, SpectralMaskEnhancement


root = Path("sb_demo"); root.mkdir(exist_ok=True)
sr = 16000
device = "cuda" if torch.cuda.is_available() else "cpu"

This prepares the environment, creates the project folder, fixes the sample rate at 16 kHz, and selects the GPU when one is available.

Utility functions

Helpers synthesize speech via gTTS, inject Gaussian noise at a target SNR, preview audio, normalize text, and keep sample metadata in a dataclass:

def tts_to_wav(text: str, out_wav: str, lang="en"):
   # gTTS only emits MP3, so synthesize first, then convert to 16 kHz mono WAV
   mp3 = out_wav.replace(".wav", ".mp3")
   gTTS(text=text, lang=lang).save(mp3)
   a = AudioSegment.from_file(mp3, format="mp3").set_channels(1).set_frame_rate(sr)
   a.export(out_wav, format="wav")
   os.remove(mp3)


def add_noise(in_wav: str, snr_db: float, out_wav: str):
   y, _ = librosa.load(in_wav, sr=sr, mono=True)
   rms = np.sqrt(np.mean(y**2) + 1e-12)            # signal RMS
   n = np.random.normal(0, 1, len(y))
   n = n / np.sqrt(np.mean(n**2) + 1e-12)          # unit-RMS Gaussian noise
   target_n_rms = rms / (10**(snr_db / 20))        # noise RMS for the target SNR
   y_noisy = np.clip(y + n * target_n_rms, -1.0, 1.0)
   sf.write(out_wav, y_noisy, sr)


def play(title, path):
   print(f" {title}: {path}")
   display(Audio(path, rate=sr))


def clean_txt(s: str) -> str:
   # lowercase, replace punctuation with spaces, collapse whitespace for fair WER
   return " ".join("".join(ch.lower() if ch.isalnum() or ch.isspace() else " " for ch in s).split())


@dataclass
class Sample:
   text: str
   clean_wav: str
   noisy_wav: str
   enhanced_wav: str

These utilities let you generate WAV files from text, produce noisy variants with precise SNR, preview audio, and normalize transcripts for fair WER measurement.
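
As a quick sanity check on the noise injection, you can estimate the achieved SNR by treating the difference between the noisy and clean signals as the noise component. This is a small sketch of ours (the helper measured_snr is not part of the pipeline above); expect values slightly off the target when clipping kicks in:

def measured_snr(clean_wav: str, noisy_wav: str) -> float:
   # estimate SNR by treating (noisy - clean) as the injected noise
   c, _ = librosa.load(clean_wav, sr=sr, mono=True)
   n, _ = librosa.load(noisy_wav, sr=sr, mono=True)
   m = min(len(c), len(n))
   noise = n[:m] - c[:m]
   p_sig = np.mean(c[:m]**2) + 1e-12
   p_noise = np.mean(noise**2) + 1e-12
   return 10 * np.log10(p_sig / p_noise)

After the samples below are generated, measured_snr(samples[0].clean_wav, samples[0].noisy_wav) should land close to the requested 3 dB.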

Generate samples and load pretrained models

Create a few short utterances, synthesize them, add noise, and load SpeechBrain’s ASR and MetricGAN+ enhancement models:

sentences = [
   "Artificial intelligence is transforming everyday life.",
   "Open source tools enable rapid research and innovation.",
   "SpeechBrain brings flexible speech pipelines to Python."
]
samples: List[Sample] = []
print(" Synthesizing short utterances with gTTS...")
for i, s in enumerate(sentences, 1):
   cw = str(root/f"clean_{i}.wav")
   nw = str(root/f"noisy_{i}.wav")
   ew = str(root/f"enhanced_{i}.wav")
   tts_to_wav(s, cw)
   add_noise(cw, snr_db=3.0 if i % 2 else 0.0, out_wav=nw)  # alternate 3 dB / 0 dB SNR
   samples.append(Sample(text=s, clean_wav=cw, noisy_wav=nw, enhanced_wav=ew))


play("Clean #1", samples[0].clean_wav)
play("Noisy #1", samples[0].noisy_wav)


print(" Loading pretrained models (this downloads once) ...")
asr = EncoderDecoderASR.from_hparams(
   source="speechbrain/asr-crdnn-rnnlm-librispeech",
   run_opts={"device": device},
   savedir=str(root/"pretrained_asr"),
)
enhancer = SpectralMaskEnhancement.from_hparams(
   source="speechbrain/metricgan-plus-voicebank",
   run_opts={"device": device},
   savedir=str(root/"pretrained_enh"),
)

This produces clean and noisy WAV files for each sentence (the enhanced paths are placeholders for now) and downloads the pretrained recognition and enhancement models on first run.
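
Before running the full comparison, a one-line smoke test confirms the download and device placement worked; a clean gTTS utterance should come back nearly verbatim:

# smoke test: clean speech should transcribe almost perfectly
print(asr.transcribe_file(samples[0].clean_wav))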

Enhance audio, transcribe, and evaluate

Define functions for enhancement, transcription, and WER evaluation, then run them over samples to collect results:

def enhance_file(in_wav: str, out_wav: str):
   sig = enhancer.enhance_file(in_wav)  # MetricGAN+ returns the enhanced waveform
   if sig.dim() == 1: sig = sig.unsqueeze(0)  # torchaudio.save expects (channels, time)
   torchaudio.save(out_wav, sig.cpu(), sr)


def transcribe(path: str) -> str:
   hyp = asr.transcribe_file(path)
   return clean_txt(hyp)


def eval_pair(ref_text: str, wav_path: str) -> Tuple[str, float]:
   hyp = transcribe(wav_path)
   return hyp, wer(clean_txt(ref_text), hyp)


print("\n Transcribing noisy vs enhanced (MetricGAN+)...")
rows = []
t0 = time.time()
for smp in samples:
   enhance_file(smp.noisy_wav, smp.enhanced_wav)
   hyp_noisy,  wer_noisy  = eval_pair(smp.text, smp.noisy_wav)
   hyp_enh,    wer_enh    = eval_pair(smp.text, smp.enhanced_wav)
   rows.append((smp.text, hyp_noisy, wer_noisy, hyp_enh, wer_enh))
t1 = time.time()

Then format and display results, batch decode, play an enhanced sample, and compute average WERs:

def fmt(x): return f"{x:.3f}" if isinstance(x, float) else x
print(f"\n Inference time: {t1 - t0:.2f}s on {device.upper()}")
print("\n# ---- Results (Noisy → Enhanced) ----")
for i, (ref, hN, wN, hE, wE) in enumerate(rows, 1):
   print(f"\nUtterance {i}")
   print("Ref:      ", ref)
   print("Noisy ASR:", hN)
   print("WER noisy:", fmt(wN))
   print("Enh ASR:  ", hE)
   print("WER enh:  ", fmt(wE))


print("\n Batch decoding (looping API):")
batch_files = [s.clean_wav for s in samples] + [s.noisy_wav for s in samples]
bt0 = time.time()
batch_hyps = [transcribe(p) for p in batch_files]
bt1 = time.time()
for p, h in zip(batch_files, batch_hyps):
   print(os.path.basename(p), "->", h[:80] + ("..." if len(h) > 80 else ""))
print(f" Batch elapsed: {bt1 - bt0:.2f}s")


play("Enhanced #1 (MetricGAN+)", samples[0].enhanced_wav)


avg_wn = sum(wN for _,_,wN,_,_ in rows) / len(rows)
avg_we = sum(wE for _,_,_,_,wE in rows) / len(rows)
print("\n Summary:")
print(f"Avg WER (Noisy):     {avg_wn:.3f}")
print(f"Avg WER (Enhanced):  {avg_we:.3f}")
print("Tip: Try different SNRs or longer texts, and switch device to GPU if available.")

Observations

Running this pipeline shows how speech enhancement upstream of ASR can reduce WER on noisy utterances. MetricGAN+ is a practical pretrained enhancement model that integrates easily with SpeechBrain’s ASR utilities. The example is compact, reproducible, and can be extended to longer datasets, different SNR settings, or alternative enhancement models.
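
As one concrete extension, SpeechBrain also publishes a SepFormer-based enhancement model on the Hugging Face hub (speechbrain/sepformer-wham16k-enhancement); swapping it in only requires replacing enhance_file. A sketch, assuming the separation interface of your installed SpeechBrain version:

try:
   from speechbrain.inference.separation import SepformerSeparation
except ImportError:  # older SpeechBrain releases
   from speechbrain.pretrained import SepformerSeparation

sep = SepformerSeparation.from_hparams(
   source="speechbrain/sepformer-wham16k-enhancement",
   run_opts={"device": device},
   savedir=str(root / "pretrained_sep"),
)

def enhance_file_sepformer(in_wav: str, out_wav: str):
   est = sep.separate_file(path=in_wav)  # (batch, time, n_sources)
   sig = est[:, :, 0].detach().cpu()     # single enhanced source -> (1, time)
   torchaudio.save(out_wav, sig, sr)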