# breakpilot-core/backend-core/services/image_processing.py

"""
Image Processing and OCR Service.

Handles:
- Image preprocessing for better OCR results (grayscale, denoising, binarization)
- PaddleOCR integration for text recognition
- Handwriting region extraction from scanned documents

Used by FileProcessor for image and PDF-to-image OCR workflows.
"""

import logging
from typing import Optional, List, Dict, Any, Tuple

import cv2
import numpy as np
from PIL import Image

from .file_processor_types import ProcessedRegion

logger = logging.getLogger(__name__)


class ImageProcessor:
    """
    Image preprocessing and OCR for BreakPilot.

    Supports:
    - PaddleOCR for German handwriting and printed text
    - OpenCV-based preprocessing (denoising, CLAHE, adaptive binarization)
    - Handwriting region extraction for exam correction

    The OCR engine is created lazily on first use. If PaddleOCR is not
    installed, OCR calls return a placeholder result instead of raising.
    """

    # Short language codes mapped to the PaddleOCR language name that was
    # previously hard-coded; any other value is passed through unchanged.
    _LANG_ALIASES = {"de": "german"}

    def __init__(self, ocr_lang: str = "de", use_gpu: bool = False):
        """
        Store the OCR configuration; the engine itself is not created yet.

        Args:
            ocr_lang: Recognition language handed to PaddleOCR ("de" is
                translated to "german").
            use_gpu: Forwarded to PaddleOCR as ``use_gpu``.
        """
        self.ocr_lang = ocr_lang
        self.use_gpu = use_gpu
        self._ocr_engine = None
        # Remembers a completed initialisation so that a missing PaddleOCR
        # installation is detected (and logged) only once per instance.
        self._ocr_init_attempted = False

    @property
    def ocr_engine(self):
        """
        Return the lazily created OCR engine, or None if it is unavailable.

        Initialisation runs at most once per instance after it has returned;
        if it raises, the exception propagates and the next access retries.
        An engine assigned directly to ``_ocr_engine`` is returned as is.
        """
        if self._ocr_engine is None and not self._ocr_init_attempted:
            self._ocr_engine = self._init_ocr_engine()
            # Set only after a clean return so unexpected errors are not
            # silently converted into "no engine" on later accesses.
            self._ocr_init_attempted = True
        return self._ocr_engine

    def _init_ocr_engine(self):
        """
        Create the PaddleOCR engine.

        Returns:
            A PaddleOCR instance, or None when the ``paddleocr`` package
            cannot be imported.
        """
        try:
            from paddleocr import PaddleOCR
        except ImportError:
            logger.warning("PaddleOCR nicht installiert - verwende Fallback")
            return None

        return PaddleOCR(
            use_angle_cls=True,
            lang=self._LANG_ALIASES.get(self.ocr_lang, self.ocr_lang),
            use_gpu=self.use_gpu,
            show_log=False
        )

    @staticmethod
    def _to_gray(img: Image.Image) -> np.ndarray:
        """Return *img* as a single-channel grayscale array for any PIL mode."""
        # Normalising to RGB first keeps cv2.cvtColor from failing on
        # grayscale, palette or bilevel images.
        rgb = np.array(img.convert("RGB"))
        return cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)

    def preprocess_image(self, img: Image.Image) -> Image.Image:
        """
        Preprocess an image for better OCR results.

        Steps, in order:
        - conversion to grayscale
        - noise reduction (non-local means)
        - contrast enhancement (CLAHE)
        - adaptive binarization

        Args:
            img: Input image in any PIL mode.

        Returns:
            The binarized image as a new PIL image.
        """
        gray = self._to_gray(img)

        # Noise reduction: filter strength 10, template window 7,
        # search window 21.
        denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)

        # Contrast enhancement (CLAHE)
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(denoised)

        # Adaptive binarization: Gaussian-weighted 11x11 neighbourhood,
        # constant 2 subtracted from the weighted mean.
        binary = cv2.adaptiveThreshold(
            enhanced,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            11,
            2
        )

        # Back to PIL
        return Image.fromarray(binary)

    def ocr_image(self, img: Image.Image) -> Dict[str, Any]:
        """
        Run OCR on an image.

        Args:
            img: Image to recognise.

        Returns:
            Dict with the keys:
            - "text": recognised lines joined with newlines
            - "confidence": mean confidence over all lines (0.0 if none)
            - "regions": list of ProcessedRegion, one per recognised line

            When no OCR engine is available, "text" holds a placeholder
            message and "regions" is empty.
        """
        engine = self.ocr_engine
        if engine is None:
            return {
                "text": "[OCR nicht verfuegbar - bitte PaddleOCR installieren]",
                "confidence": 0.0,
                "regions": []
            }

        # PIL to numpy array
        img_array = np.array(img)

        # A 2-D array is grayscale; expand it to three channels before OCR.
        if len(img_array.shape) == 2:
            img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)

        result = engine.ocr(img_array, cls=True)

        # Nothing recognised: the result is empty or its first entry is.
        if not result or not result[0]:
            return {"text": "", "confidence": 0.0, "regions": []}

        all_text = []
        all_regions = []
        total_confidence = 0.0

        for line in result[0]:
            bbox_points = line[0]  # [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            text, confidence = line[1]

            # Collapse the four corner points into an axis-aligned
            # (x_min, y_min, x_max, y_max) box.
            x_coords = [p[0] for p in bbox_points]
            y_coords = [p[1] for p in bbox_points]
            bbox = (
                int(min(x_coords)),
                int(min(y_coords)),
                int(max(x_coords)),
                int(max(y_coords))
            )

            all_text.append(text)
            all_regions.append(ProcessedRegion(
                text=text,
                confidence=confidence,
                bbox=bbox
            ))
            total_confidence += confidence

        avg_confidence = total_confidence / len(all_regions) if all_regions else 0.0

        return {
            "text": "\n".join(all_text),
            "confidence": avg_confidence,
            "regions": all_regions
        }

    def extract_handwriting_regions(
        self,
        img: Image.Image,
        min_area: int = 500
    ) -> List[Dict[str, Any]]:
        """
        Detect candidate text regions in an image and run OCR on each.

        Useful for exams with printed questions and handwritten answers.
        Regions are found by edge detection, dilation and external contour
        extraction.

        Args:
            img: Input image in any PIL mode.
            min_area: Minimum contour area for a region to be kept.

        Returns:
            List of dicts with the keys "bbox" (x1, y1, x2, y2), "area",
            "text" and "confidence", sorted top to bottom by the upper
            edge of the bounding box.
        """
        gray = self._to_gray(img)

        # Detect edges
        edges = cv2.Canny(gray, 50, 150)

        # Dilate with a wide, flat kernel so neighbouring strokes merge
        # into connected blobs.
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 5))
        dilated = cv2.dilate(edges, kernel, iterations=2)

        contours, _ = cv2.findContours(
            dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )

        regions = []
        for contour in contours:
            area = cv2.contourArea(contour)
            if area < min_area:
                continue

            x, y, w, h = cv2.boundingRect(contour)
            bbox = (x, y, x + w, y + h)

            # Crop from the original image, not the grayscale working copy.
            ocr_result = self.ocr_image(img.crop(bbox))

            regions.append({
                "bbox": bbox,
                "area": area,
                "text": ocr_result["text"],
                "confidence": ocr_result["confidence"]
            })

        # Sort by Y position (top to bottom)
        regions.sort(key=lambda r: r["bbox"][1])

        return regions