Оффлайн мультиязычный OCR-агент на Python с EasyOCR и OpenCV в Colab

12 сентября 2025 г. · 5 мин

Что мы создаём

В этой инструкции показано, как собрать продвинутого OCR-агента в Google Colab, работающего оффлайн и использующего EasyOCR, OpenCV и Pillow. Агент включает пайплайн предобработки (CLAHE, удаление шума, повышение резкости, адаптивная бинаризация), фильтрацию по уверенности, поиск шаблонов (email, URL, даты, телефоны), визуализацию с ограничивающими прямоугольниками, пакетную обработку и экспорт результатов в структурированном виде.

Установка в Colab

Установите необходимые пакеты и импортируйте модули для предобработки изображений, OCR и визуализации.

!pip install easyocr opencv-python pillow matplotlib

import easyocr
import cv2
import numpy as np
from PIL import Image, ImageEnhance, ImageFilter
import matplotlib.pyplot as plt
import os
import json
from typing import List, Dict, Tuple, Optional
import re
from google.colab import files
import io

Класс AdvancedOCRAgent

Основная реализация инкапсулирует инициализацию с мультиязычным EasyOCR, продвинутую предобработку, извлечение текста с фильтрацией по уверенности, визуализацию, интеллектуальный анализ текста (детекция шаблонов и подсказки по языку), пакетную обработку и функции экспорта.

class AdvancedOCRAgent:
    """
    OCR agent built on EasyOCR with OpenCV preprocessing.

    Provides image preprocessing, confidence-filtered text extraction,
    result visualization, regex-based text analysis, batch processing
    and export of results to JSON or plain text.
    """

    # Patterns used by smart_text_analysis. They are compiled once at class
    # creation instead of on every call.
    _EMAIL_RE = re.compile(
        r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    )
    # The optional country-code prefix is a NON-capturing group so that
    # findall() returns the whole phone number, not just the prefix.
    _PHONE_RE = re.compile(
        r'(?:\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
    )
    _URL_RE = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        re.IGNORECASE,
    )
    _DATE_RE = re.compile(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b')

    def __init__(self, languages: Optional[List[str]] = None, gpu: bool = True):
        """
        Create the EasyOCR reader.

        Args:
            languages: EasyOCR language codes; defaults to ``['en']``.
            gpu: Passed through to ``easyocr.Reader``.
        """
        # Avoid a shared mutable default argument; copy the caller's list.
        languages = ['en'] if languages is None else list(languages)
        print(" Initializing Advanced OCR Agent...")
        self.languages = languages
        self.reader = easyocr.Reader(languages, gpu=gpu)
        # Detections below this confidence are dropped by extract_text.
        self.confidence_threshold = 0.5
        print(f" OCR Agent ready! Languages: {languages}")

    def upload_image(self) -> Optional[str]:
        """
        Upload an image through the Colab file picker.

        Returns:
            The name of the first uploaded file, or ``None`` when nothing
            was uploaded.
        """
        print(" Upload your image file:")
        uploaded = files.upload()
        if uploaded:
            filename = next(iter(uploaded))
            print(f" Uploaded: {filename}")
            return filename
        return None

    def preprocess_image(self, image: np.ndarray, enhance: bool = True) -> np.ndarray:
        """
        Convert an image to a binarized grayscale image for OCR.

        Args:
            image: Grayscale (2-D) or colour (3-D) image array. Colour input
                is treated as BGR, or BGRA when it has four channels.
            enhance: When true, apply CLAHE, non-local-means denoising and a
                3x3 sharpening kernel before thresholding.

        Returns:
            The adaptively thresholded single-channel image.
        """
        if image.ndim == 3:
            channels = image.shape[2]
            if channels == 4:
                gray = cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
            elif channels == 1:
                # A (H, W, 1) array is already grayscale; drop the last axis.
                gray = image[:, :, 0].copy()
            else:
                gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image.copy()

        if enhance:
            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
            gray = clahe.apply(gray)

            gray = cv2.fastNlMeansDenoising(gray)

            # Sharpening kernel; weights sum to 1 so brightness is preserved.
            kernel = np.array(
                [[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]], dtype=np.float32
            )
            gray = cv2.filter2D(gray, -1, kernel)

        binary = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
        )

        return binary

    def extract_text(self, image_path: str, preprocess: bool = True) -> Dict:
        """
        Run OCR on an image file and collect confidence-filtered results.

        Args:
            image_path: Path of the image to read.
            preprocess: When true, run ``preprocess_image`` before OCR.

        Returns:
            A dict with keys ``raw_results`` (reader output),
            ``filtered_results`` (entries at or above the confidence
            threshold), ``full_text``, ``confidence_stats`` (mean/min/max/std,
            empty when nothing passed the filter), ``word_count`` and
            ``line_count``.

        Raises:
            ValueError: If the image cannot be loaded.
        """
        print(f" Processing image: {image_path}")

        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Could not load image: {image_path}")

        processed_image = self.preprocess_image(image) if preprocess else image

        results = self.reader.readtext(processed_image)

        extracted_data = {
            'raw_results': results,
            'filtered_results': [],
            'full_text': '',
            'confidence_stats': {},
            'word_count': 0,
            'line_count': 0
        }

        high_confidence_text = []
        confidences = []

        for (bbox, text, confidence) in results:
            if confidence >= self.confidence_threshold:
                # Store a plain float so the entry is JSON-friendly.
                confidence = float(confidence)
                extracted_data['filtered_results'].append({
                    'text': text,
                    'confidence': confidence,
                    'bbox': bbox
                })
                high_confidence_text.append(text)
                confidences.append(confidence)

        extracted_data['full_text'] = ' '.join(high_confidence_text)
        extracted_data['word_count'] = len(extracted_data['full_text'].split())
        extracted_data['line_count'] = len(high_confidence_text)

        if confidences:
            extracted_data['confidence_stats'] = {
                'mean': float(np.mean(confidences)),
                'min': float(np.min(confidences)),
                'max': float(np.max(confidences)),
                'std': float(np.std(confidences))
            }

        return extracted_data

    def visualize_results(self, image_path: str, results: Dict, show_bbox: bool = True):
        """
        Plot OCR results in a 2x2 matplotlib figure.

        The panels are: the image with bounding boxes (only when
        ``show_bbox`` is true), the preprocessed image, a histogram of
        confidences, and a bar chart of mean/min/max confidence.

        Args:
            image_path: Path of the image that was processed.
            results: Result dict as returned by ``extract_text``.
            show_bbox: Whether to draw the bounding-box panel.

        Raises:
            ValueError: If the image cannot be loaded.
        """
        image = cv2.imread(image_path)
        if image is None:
            # Same error as extract_text, instead of an opaque cvtColor failure.
            raise ValueError(f"Could not load image: {image_path}")
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        plt.figure(figsize=(15, 10))

        if show_bbox:
            plt.subplot(2, 2, 1)
            img_with_boxes = image_rgb.copy()

            for item in results['filtered_results']:
                # OpenCV drawing functions expect 32-bit integer points.
                bbox = np.array(item['bbox']).astype(np.int32)
                cv2.polylines(img_with_boxes, [bbox], True, (255, 0, 0), 2)

                # Plain ints for putText; keep the label inside the image
                # when the box touches the top edge.
                x, y = int(bbox[0][0]), int(bbox[0][1])
                cv2.putText(img_with_boxes, f"{item['confidence']:.2f}",
                            (x, max(y - 10, 10)), cv2.FONT_HERSHEY_SIMPLEX,
                            0.5, (255, 0, 0), 1)

            plt.imshow(img_with_boxes)
            plt.title("OCR Results with Bounding Boxes")
            plt.axis('off')

        plt.subplot(2, 2, 2)
        processed = self.preprocess_image(image)
        plt.imshow(processed, cmap='gray')
        plt.title("Preprocessed Image")
        plt.axis('off')

        plt.subplot(2, 2, 3)
        confidences = [item['confidence'] for item in results['filtered_results']]
        if confidences:
            plt.hist(confidences, bins=20, alpha=0.7, color='blue')
            plt.xlabel('Confidence Score')
            plt.ylabel('Frequency')
            plt.title('Confidence Score Distribution')
            plt.axvline(self.confidence_threshold, color='red', linestyle='--',
                        label=f'Threshold: {self.confidence_threshold}')
            plt.legend()

        plt.subplot(2, 2, 4)
        stats = results['confidence_stats']
        if stats:
            labels = ['Mean', 'Min', 'Max']
            values = [stats['mean'], stats['min'], stats['max']]
            plt.bar(labels, values, color=['green', 'red', 'blue'])
            plt.ylabel('Confidence Score')
            plt.title('Confidence Statistics')
            plt.ylim(0, 1)

        plt.tight_layout()
        plt.show()

    def smart_text_analysis(self, text: str) -> Dict:
        """
        Detect simple patterns and give a rough script/language hint.

        Args:
            text: Text to analyse.

        Returns:
            A dict with ``language_detection`` (a heuristic hint based on the
            characters present), ``text_type``, ``key_info`` (always empty)
            and ``patterns`` (a dict of the non-empty match lists for emails,
            phones, urls and dates).
        """
        analysis = {
            'language_detection': 'unknown',
            'text_type': 'unknown',
            'key_info': {},
            'patterns': []
        }

        patterns = {
            'emails': self._EMAIL_RE.findall(text),
            'phones': self._PHONE_RE.findall(text),
            'urls': self._URL_RE.findall(text),
            'dates': self._DATE_RE.findall(text)
        }

        analysis['patterns'] = {k: v for k, v in patterns.items() if v}

        if patterns['emails'] or patterns['phones']:
            analysis['text_type'] = 'contact_info'
        elif patterns['urls']:
            analysis['text_type'] = 'web_content'
        elif patterns['dates']:
            analysis['text_type'] = 'document_with_dates'

        lowered = text.lower()
        if re.search(r'[а-яё]', lowered):
            analysis['language_detection'] = 'russian'
        elif re.search(r'[àáâãäåæçèéêëìíîïñòóôõöøùúûüý]', lowered):
            analysis['language_detection'] = 'romance_language'
        # Kana (hiragana + katakana blocks) must be tested before Han
        # characters: Japanese text normally contains kanji as well.
        elif re.search(r'[\u3040-\u30ff]', text):
            analysis['language_detection'] = 'japanese'
        elif re.search(r'[一-龯]', text):
            analysis['language_detection'] = 'chinese'
        elif re.search(r'[a-zA-Z]', text):
            analysis['language_detection'] = 'latin_based'

        return analysis

    def process_batch(self, image_folder: str) -> List[Dict]:
        """
        Run ``extract_text`` on every supported image in a folder.

        Files are processed in sorted name order. A failure on one file is
        printed and does not stop the batch.

        Args:
            image_folder: Directory containing the images.

        Returns:
            One result dict per successfully processed file, each with an
            added ``filename`` key.
        """
        results = []
        supported_formats = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif')

        # os.listdir order is arbitrary; sort for reproducible output.
        for filename in sorted(os.listdir(image_folder)):
            if not filename.lower().endswith(supported_formats):
                continue
            image_path = os.path.join(image_folder, filename)
            try:
                result = self.extract_text(image_path)
                result['filename'] = filename
                results.append(result)
                print(f" Processed: {filename}")
            except Exception as e:
                # Best-effort batch: report and continue with the next file.
                print(f" Error processing {filename}: {str(e)}")

        return results

    @staticmethod
    def _json_default(value):
        """Convert NumPy scalars/arrays to native Python values for json."""
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(
            f"Object of type {type(value).__name__} is not JSON serializable"
        )

    def export_results(self, results: Dict, format: str = 'json') -> str:
        """
        Write results to a file in the current working directory.

        Args:
            results: A result dict from ``extract_text``, or a list of such
                dicts as returned by ``process_batch``.
            format: ``'json'`` (writes ``ocr_results.json``) or ``'txt'``
                (writes ``extracted_text.txt``); case-insensitive.

        Returns:
            The name of the file written.

        Raises:
            ValueError: If ``format`` is not supported.
        """
        kind = format.lower()
        if kind == 'json':
            # OCR output can contain NumPy integers/floats, which json cannot
            # encode on its own; _json_default converts them.
            output = json.dumps(results, indent=2, ensure_ascii=False,
                                default=self._json_default)
            filename = 'ocr_results.json'
        elif kind == 'txt':
            if isinstance(results, list):
                # Batch results: one line of text per image.
                output = '\n'.join(item['full_text'] for item in results)
            else:
                output = results['full_text']
            filename = 'extracted_text.txt'
        else:
            raise ValueError("Supported formats: 'json', 'txt'")

        with open(filename, 'w', encoding='utf-8') as f:
            f.write(output)

        print(f" Results exported to: {filename}")
        return filename

Демонстрация и использование

В демо-функции показан полный поток: инициализация агента, загрузка изображения в Colab, извлечение с предобработкой, отображение количества слов и строк, вывод средней уверенности, интеллектуальный анализ текста, визуализация ограничивающих рамок и экспорт результатов.

def demo_ocr_agent():
    """
    Run the end-to-end demo: upload, OCR, analysis, plots and JSON export.

    Any exception raised while processing the uploaded image is caught and
    printed rather than propagated.
    """
    print(" Advanced OCR AI Agent Demo")
    print("=" * 50)

    agent = AdvancedOCRAgent(languages=['en'], gpu=True)

    uploaded_path = agent.upload_image()
    # Guard clause: nothing to do when the upload dialog returned no file.
    if not uploaded_path:
        print("No image uploaded. Please try again.")
        return

    try:
        ocr_output = agent.extract_text(uploaded_path, preprocess=True)
        stats = ocr_output['confidence_stats']

        print("\n OCR Results:")
        print(f"Words detected: {ocr_output['word_count']}")
        print(f"Lines detected: {ocr_output['line_count']}")
        # Stats are empty when nothing passed the confidence filter.
        print(f"Average confidence: {stats.get('mean', 0):.2f}")

        divider = "-" * 30
        print("\n Extracted Text:")
        print(divider)
        print(ocr_output['full_text'])
        print(divider)

        report = agent.smart_text_analysis(ocr_output['full_text'])
        print("\n Smart Analysis:")
        print(f"Detected text type: {report['text_type']}")
        print(f"Language hints: {report['language_detection']}")
        found = report['patterns']
        if found:
            print(f"Found patterns: {list(found.keys())}")

        agent.visualize_results(uploaded_path, ocr_output)

        agent.export_results(ocr_output, 'json')

    except Exception as exc:
        print(f" Error: {str(exc)}")


# Run the demo only when this file is executed directly (not when imported).
if __name__ == "__main__":
   demo_ocr_agent()

Рекомендации

Настраивайте confidence_threshold для вашей задачи: выше порога — более надёжные фрагменты, ниже — больше текста, но с шумом.
Для специфичных документов (счета, паспорта) добавьте дополнительные паттерны и парсеры.
Для больших наборов данных пользуйтесь process_batch и экспортируйте результаты в JSON/TXT для последующего анализа.
Эта конфигурация работает полностью оффлайн в Colab с поддержкой GPU, что удобно для конфиденциальных данных и автономных рабочих процессов.