"""
Image Processing and OCR Service.

Handles:
- Image preprocessing for better OCR results (grayscale, denoising, binarization)
- PaddleOCR integration for text recognition
- Handwriting region extraction from scanned documents

Used by FileProcessor for image and PDF-to-image OCR workflows.
"""

import logging
from typing import Optional, List, Dict, Any, Tuple

import cv2
import numpy as np
from PIL import Image

from .file_processor_types import ProcessedRegion

logger = logging.getLogger(__name__)

# Maps the constructor's language codes to PaddleOCR's language identifiers.
# PaddleOCR expects names like "german", not ISO codes like "de".
_PADDLE_LANG_MAP = {
    "de": "german",
    "en": "en",
    "fr": "french",
    "es": "spanish",
    "it": "italian",
}


class ImageProcessor:
    """
    Image preprocessing and OCR for BreakPilot.

    Supports:
    - PaddleOCR for German handwriting and printed text
    - OpenCV-based preprocessing (denoising, CLAHE, adaptive binarization)
    - Handwriting region extraction for exam correction
    """

    def __init__(self, ocr_lang: str = "de", use_gpu: bool = False):
        """
        Args:
            ocr_lang: Language code for OCR ("de" by default). Mapped to
                PaddleOCR's identifier via _PADDLE_LANG_MAP; unknown codes
                are passed through unchanged.
            use_gpu: Whether PaddleOCR should run on the GPU.
        """
        self.ocr_lang = ocr_lang
        self.use_gpu = use_gpu
        # Engine is created lazily on first access (see `ocr_engine`).
        self._ocr_engine = None

    @property
    def ocr_engine(self):
        """Lazy-load the OCR engine on first access."""
        if self._ocr_engine is None:
            self._ocr_engine = self._init_ocr_engine()
        return self._ocr_engine

    def _init_ocr_engine(self):
        """
        Initialize PaddleOCR, or return None when it is not installed.

        BUGFIX: the constructor's ``ocr_lang`` parameter was previously
        ignored and the language hard-coded to German. It is now mapped to
        PaddleOCR's identifier (default "de" still resolves to "german",
        so existing behavior is preserved).

        Returns:
            A configured PaddleOCR instance, or None if paddleocr is
            not importable (fallback mode).
        """
        try:
            from paddleocr import PaddleOCR
            return PaddleOCR(
                use_angle_cls=True,
                lang=_PADDLE_LANG_MAP.get(self.ocr_lang, self.ocr_lang),
                use_gpu=self.use_gpu,
                show_log=False
            )
        except ImportError:
            logger.warning("PaddleOCR nicht installiert - verwende Fallback")
            return None

    def preprocess_image(self, img: Image.Image) -> Image.Image:
        """
        Preprocess an image for better OCR results.

        Steps:
        - Convert to grayscale
        - Denoising
        - Contrast enhancement (CLAHE)
        - Adaptive binarization

        Args:
            img: Input PIL image (any mode).

        Returns:
            Binarized single-channel PIL image.
        """
        # BUGFIX: np.array(img) on a grayscale ("L") or RGBA image does not
        # produce the 3-channel RGB array that COLOR_RGB2BGR requires, so
        # force RGB mode first.
        cv_img = cv2.cvtColor(np.array(img.convert("RGB")), cv2.COLOR_RGB2BGR)
        # Convert to grayscale
        gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
        # Denoising
        denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
        # Contrast enhancement (CLAHE)
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(denoised)
        # Adaptive binarization
        binary = cv2.adaptiveThreshold(
            enhanced, 255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 11, 2
        )
        # Back to PIL
        return Image.fromarray(binary)

    def ocr_image(self, img: Image.Image) -> Dict[str, Any]:
        """
        Run OCR on an image.

        Args:
            img: Input PIL image.

        Returns:
            Dict with keys "text" (joined lines), "confidence" (mean over
            recognized regions, 0.0 if none) and "regions" (list of
            ProcessedRegion).
        """
        if self.ocr_engine is None:
            # Fallback when PaddleOCR is not installed.
            return {
                "text": "[OCR nicht verfuegbar - bitte PaddleOCR installieren]",
                "confidence": 0.0,
                "regions": []
            }

        # PIL to numpy array
        img_array = np.array(img)

        # PaddleOCR expects a 3-channel RGB array.
        if len(img_array.shape) == 2:
            # Grayscale -> RGB
            img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
        elif img_array.shape[2] == 4:
            # BUGFIX: RGBA images were passed through with 4 channels,
            # which PaddleOCR does not handle; drop the alpha channel.
            img_array = cv2.cvtColor(img_array, cv2.COLOR_RGBA2RGB)

        # Run OCR
        result = self.ocr_engine.ocr(img_array, cls=True)

        if not result or not result[0]:
            return {"text": "", "confidence": 0.0, "regions": []}

        all_text = []
        all_regions = []
        total_confidence = 0.0

        for line in result[0]:
            bbox_points = line[0]  # [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            text, confidence = line[1]

            # Convert the quadrilateral to an axis-aligned x1, y1, x2, y2 box.
            x_coords = [p[0] for p in bbox_points]
            y_coords = [p[1] for p in bbox_points]
            bbox = (
                int(min(x_coords)), int(min(y_coords)),
                int(max(x_coords)), int(max(y_coords))
            )

            all_text.append(text)
            all_regions.append(ProcessedRegion(
                text=text,
                confidence=confidence,
                bbox=bbox
            ))
            total_confidence += confidence

        avg_confidence = total_confidence / len(all_regions) if all_regions else 0.0

        return {
            "text": "\n".join(all_text),
            "confidence": avg_confidence,
            "regions": all_regions
        }

    def extract_handwriting_regions(
        self,
        img: Image.Image,
        min_area: int = 500
    ) -> List[Dict[str, Any]]:
        """
        Detect and extract handwritten areas from an image.

        Useful for exams with printed questions and handwritten answers.

        Args:
            img: Input image.
            min_area: Minimum contour area (in pixels) for detected regions.

        Returns:
            List of region dicts with "bbox", "area", "text" and
            "confidence", sorted top-to-bottom by y position.
        """
        # BUGFIX: force RGB so COLOR_RGB2BGR also works for grayscale/RGBA
        # inputs (same issue as in preprocess_image).
        cv_img = cv2.cvtColor(np.array(img.convert("RGB")), cv2.COLOR_RGB2BGR)
        gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)

        # Edge detection
        edges = cv2.Canny(gray, 50, 150)

        # Morphological dilation to merge nearby strokes into one region;
        # the wide kernel (15x5) favors horizontal joining of text lines.
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 5))
        dilated = cv2.dilate(edges, kernel, iterations=2)

        # Find outer contours of the merged regions.
        contours, _ = cv2.findContours(
            dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )

        regions = []
        for contour in contours:
            area = cv2.contourArea(contour)
            if area < min_area:
                # Skip noise / tiny fragments.
                continue

            x, y, w, h = cv2.boundingRect(contour)

            # Crop the region from the original (unprocessed) image.
            region_img = img.crop((x, y, x + w, y + h))

            # Run OCR on the cropped region.
            ocr_result = self.ocr_image(region_img)

            regions.append({
                "bbox": (x, y, x + w, y + h),
                "area": area,
                "text": ocr_result["text"],
                "confidence": ocr_result["confidence"]
            })

        # Sort top-to-bottom by y position.
        regions.sort(key=lambda r: r["bbox"][1])

        return regions