Build an Offline Multilingual OCR Agent in Colab with EasyOCR & OpenCV
What you’ll build
This tutorial walks through building an advanced, offline OCR AI agent in Google Colab. The agent combines EasyOCR, OpenCV, and Pillow to deliver multilingual recognition backed by a preprocessing pipeline (CLAHE, denoising, sharpening, adaptive thresholding), plus confidence filtering, pattern detection (emails, URLs, dates, phone numbers), bounding-box visualization, batch processing, and structured exports to JSON or TXT.
Setup in Colab
Install the required packages and import the main modules to prepare the environment for image preprocessing, OCR and visualization.
!pip install easyocr opencv-python pillow matplotlib
import easyocr
import cv2
import numpy as np
from PIL import Image, ImageEnhance, ImageFilter
import matplotlib.pyplot as plt
import os
import json
from typing import List, Dict, Tuple, Optional
import re
from google.colab import files
import io
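EasyOCR runs on PyTorch under the hood, so it is worth confirming that the Colab runtime actually exposes a GPU before creating a reader; otherwise pass gpu=False. A minimal check (the messages are purely illustrative):

import torch

# EasyOCR silently falls back to CPU when no GPU is present; checking
# up front avoids surprise slowdowns on large batches.
if torch.cuda.is_available():
    print(f"GPU available: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU detected; pass gpu=False when creating the agent.")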
AdvancedOCRAgent class
The core implementation encapsulates initialization with multilingual EasyOCR, advanced preprocessing, text extraction with confidence filtering, visualization, smart text analysis (pattern detection and language hints), batch processing and export functions.
class AdvancedOCRAgent:
"""
Advanced OCR AI Agent with preprocessing, multi-language support,
and intelligent text extraction capabilities.
"""
def __init__(self, languages: List[str] = ['en'], gpu: bool = True):
"""Initialize OCR agent with specified languages."""
print(" Initializing Advanced OCR Agent...")
self.languages = languages
self.reader = easyocr.Reader(languages, gpu=gpu)
self.confidence_threshold = 0.5
print(f" OCR Agent ready! Languages: {languages}")
def upload_image(self) -> Optional[str]:
"""Upload image file through Colab interface."""
print(" Upload your image file:")
uploaded = files.upload()
if uploaded:
filename = list(uploaded.keys())[0]
print(f" Uploaded: {filename}")
return filename
return None
def preprocess_image(self, image: np.ndarray, enhance: bool = True) -> np.ndarray:
"""Advanced image preprocessing for better OCR accuracy."""
if len(image.shape) == 3:
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
else:
gray = image.copy()
if enhance:
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
gray = clahe.apply(gray)
gray = cv2.fastNlMeansDenoising(gray)
kernel = np.array([[-1,-1,-1], [-1,9,-1], [-1,-1,-1]])
gray = cv2.filter2D(gray, -1, kernel)
binary = cv2.adaptiveThreshold(
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
)
return binary
def extract_text(self, image_path: str, preprocess: bool = True) -> Dict:
"""Extract text from image with advanced processing."""
print(f" Processing image: {image_path}")
image = cv2.imread(image_path)
if image is None:
raise ValueError(f"Could not load image: {image_path}")
if preprocess:
processed_image = self.preprocess_image(image)
else:
processed_image = image
results = self.reader.readtext(processed_image)
extracted_data = {
'raw_results': results,
'filtered_results': [],
'full_text': '',
'confidence_stats': {},
'word_count': 0,
'line_count': 0
}
high_confidence_text = []
confidences = []
for (bbox, text, confidence) in results:
if confidence >= self.confidence_threshold:
extracted_data['filtered_results'].append({
'text': text,
'confidence': confidence,
'bbox': bbox
})
high_confidence_text.append(text)
confidences.append(confidence)
extracted_data['full_text'] = ' '.join(high_confidence_text)
extracted_data['word_count'] = len(extracted_data['full_text'].split())
        extracted_data['line_count'] = len(high_confidence_text)  # one entry per detected text region
if confidences:
extracted_data['confidence_stats'] = {
'mean': np.mean(confidences),
'min': np.min(confidences),
'max': np.max(confidences),
'std': np.std(confidences)
}
return extracted_data
def visualize_results(self, image_path: str, results: Dict, show_bbox: bool = True):
"""Visualize OCR results with bounding boxes."""
image = cv2.imread(image_path)
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
plt.figure(figsize=(15, 10))
if show_bbox:
plt.subplot(2, 2, 1)
img_with_boxes = image_rgb.copy()
for item in results['filtered_results']:
                bbox = np.array(item['bbox'], dtype=np.int32)  # OpenCV drawing functions expect int32 points
cv2.polylines(img_with_boxes, [bbox], True, (255, 0, 0), 2)
x, y = bbox[0]
cv2.putText(img_with_boxes, f"{item['confidence']:.2f}",
(x, y-10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)
plt.imshow(img_with_boxes)
plt.title("OCR Results with Bounding Boxes")
plt.axis('off')
plt.subplot(2, 2, 2)
processed = self.preprocess_image(image)
plt.imshow(processed, cmap='gray')
plt.title("Preprocessed Image")
plt.axis('off')
plt.subplot(2, 2, 3)
confidences = [item['confidence'] for item in results['filtered_results']]
if confidences:
plt.hist(confidences, bins=20, alpha=0.7, color='blue')
plt.xlabel('Confidence Score')
plt.ylabel('Frequency')
plt.title('Confidence Score Distribution')
plt.axvline(self.confidence_threshold, color='red', linestyle='--',
label=f'Threshold: {self.confidence_threshold}')
plt.legend()
plt.subplot(2, 2, 4)
stats = results['confidence_stats']
if stats:
labels = ['Mean', 'Min', 'Max']
values = [stats['mean'], stats['min'], stats['max']]
plt.bar(labels, values, color=['green', 'red', 'blue'])
plt.ylabel('Confidence Score')
plt.title('Confidence Statistics')
plt.ylim(0, 1)
plt.tight_layout()
plt.show()
def smart_text_analysis(self, text: str) -> Dict:
"""Perform intelligent analysis of extracted text."""
analysis = {
'language_detection': 'unknown',
'text_type': 'unknown',
'key_info': {},
'patterns': []
}
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        # Non-capturing country-code group so findall returns full numbers
        phone_pattern = r'(?:\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
        url_pattern = r'https?://(?:[a-zA-Z0-9$\-_@.&+!*(),]|(?:%[0-9a-fA-F]{2}))+'
date_pattern = r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'
patterns = {
'emails': re.findall(email_pattern, text, re.IGNORECASE),
'phones': re.findall(phone_pattern, text),
'urls': re.findall(url_pattern, text, re.IGNORECASE),
'dates': re.findall(date_pattern, text)
}
analysis['patterns'] = {k: v for k, v in patterns.items() if v}
if any(patterns.values()):
if patterns.get('emails') or patterns.get('phones'):
analysis['text_type'] = 'contact_info'
elif patterns.get('urls'):
analysis['text_type'] = 'web_content'
elif patterns.get('dates'):
analysis['text_type'] = 'document_with_dates'
if re.search(r'[а-яё]', text.lower()):
analysis['language_detection'] = 'russian'
elif re.search(r'[àáâãäåæçèéêëìíîïñòóôõöøùúûüý]', text.lower()):
analysis['language_detection'] = 'romance_language'
        # Check kana before CJK ideographs: Japanese text usually mixes kanji
        # with kana, so a kana hit is the stronger Japanese signal.
        elif re.search(r'[\u3040-\u30ff]', text):
            analysis['language_detection'] = 'japanese'
        elif re.search(r'[\u4e00-\u9fff]', text):
            analysis['language_detection'] = 'chinese'
elif re.search(r'[a-zA-Z]', text):
analysis['language_detection'] = 'latin_based'
return analysis
def process_batch(self, image_folder: str) -> List[Dict]:
"""Process multiple images in batch."""
results = []
supported_formats = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')
for filename in os.listdir(image_folder):
if filename.lower().endswith(supported_formats):
image_path = os.path.join(image_folder, filename)
try:
result = self.extract_text(image_path)
result['filename'] = filename
results.append(result)
print(f" Processed: {filename}")
except Exception as e:
print(f" Error processing {filename}: {str(e)}")
return results
def export_results(self, results: Dict, format: str = 'json') -> str:
"""Export results in specified format."""
        if format.lower() == 'json':
            # default= converts numpy scalars (e.g. bbox coordinates) that
            # json.dumps cannot serialize on its own
            output = json.dumps(results, indent=2, ensure_ascii=False,
                                default=lambda o: o.item() if isinstance(o, np.generic) else str(o))
            filename = 'ocr_results.json'
elif format.lower() == 'txt':
output = results['full_text']
filename = 'extracted_text.txt'
else:
raise ValueError("Supported formats: 'json', 'txt'")
with open(filename, 'w', encoding='utf-8') as f:
f.write(output)
print(f" Results exported to: {filename}")
return filename
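Before the full demo below, a minimal usage sketch: initialize the agent for English plus French, tighten the confidence threshold, and extract from an image already in the runtime ('sample.jpg' is a placeholder filename). Note that EasyOCR constrains some language combinations; non-Latin scripts such as Chinese or Japanese can generally only be paired with English.

# Minimal usage sketch ('sample.jpg' is a placeholder filename)
ocr = AdvancedOCRAgent(languages=['en', 'fr'], gpu=True)
ocr.confidence_threshold = 0.6  # stricter filtering than the 0.5 default

data = ocr.extract_text('sample.jpg', preprocess=True)
print(data['full_text'])
print(data['confidence_stats'])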
Demo and workflow
The demo function ties the pieces together: initialize the agent with languages and GPU support, upload an image through the Colab interface, extract text with preprocessing enabled, print word/line counts and confidence statistics, run the smart analysis, visualize the results, and export them to JSON.
def demo_ocr_agent():
"""Demonstrate the OCR agent capabilities."""
print(" Advanced OCR AI Agent Demo")
print("=" * 50)
ocr = AdvancedOCRAgent(languages=['en'], gpu=True)
image_path = ocr.upload_image()
if image_path:
try:
results = ocr.extract_text(image_path, preprocess=True)
print("\n OCR Results:")
print(f"Words detected: {results['word_count']}")
print(f"Lines detected: {results['line_count']}")
print(f"Average confidence: {results['confidence_stats'].get('mean', 0):.2f}")
print("\n Extracted Text:")
print("-" * 30)
print(results['full_text'])
print("-" * 30)
analysis = ocr.smart_text_analysis(results['full_text'])
print(f"\n Smart Analysis:")
print(f"Detected text type: {analysis['text_type']}")
print(f"Language hints: {analysis['language_detection']}")
if analysis['patterns']:
print(f"Found patterns: {list(analysis['patterns'].keys())}")
ocr.visualize_results(image_path, results)
ocr.export_results(results, 'json')
except Exception as e:
print(f" Error: {str(e)}")
else:
print("No image uploaded. Please try again.")
if __name__ == "__main__":
demo_ocr_agent()
Practical tips and extensions
- Preprocessing matters: apply CLAHE, denoising, and sharpening to low-contrast or noisy scans.
- Tune confidence_threshold to balance recall against precision; the threshold-sweep sketch below re-filters cached results without re-running OCR.
- Extend smart_text_analysis with domain-specific patterns (invoices, IDs); see the invoice sketch below.
- For large datasets, use process_batch for folder-level extraction and export the results to JSON or TXT; see the batch sketch below.
- This approach keeps everything local and GPU-accelerated in Colab, avoiding external OCR APIs while remaining modular for production extensions.
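Because extract_text keeps the unfiltered EasyOCR output under raw_results, thresholds can be swept without re-running OCR. A minimal sketch, assuming results is the dict returned by extract_text:

# Re-filter cached detections at several thresholds without re-running OCR
for threshold in (0.3, 0.5, 0.7, 0.9):
    kept = [text for (_bbox, text, conf) in results['raw_results'] if conf >= threshold]
    print(f"threshold={threshold}: {len(kept)} detections kept")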
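For domain-specific extraction, additional regexes can be run over the extracted text in the style of smart_text_analysis; the formats below (invoice numbers like INV-12345, currency amounts) are assumptions to adapt to your documents:

import re

# Hypothetical invoice-style patterns; adjust the formats to your data
invoice_patterns = {
    'invoice_numbers': r'\bINV-?\d{4,}\b',
    'amounts': r'[$€£]\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?',
}

text = results['full_text']  # output of extract_text
found = {name: re.findall(p, text) for name, p in invoice_patterns.items()}
print({k: v for k, v in found.items() if v})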
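And for folder-level runs, a sketch combining process_batch with a compact JSON export ('images/' is a placeholder folder; only lightweight, JSON-safe fields are kept):

# Batch a folder and export a compact summary ('images/' is a placeholder)
batch = ocr.process_batch('images/')

summary = [
    {
        'filename': r['filename'],
        'word_count': r['word_count'],
        'mean_confidence': float(r['confidence_stats'].get('mean', 0.0)),
        'text': r['full_text'],
    }
    for r in batch
]
with open('batch_summary.json', 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2, ensure_ascii=False)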