[split-required] [guardrail-change] Enforce 500 LOC budget across all services
Install LOC guardrails (check-loc.sh, architecture.md, pre-commit hook) and split all 44 files exceeding 500 LOC into domain-focused modules:

- consent-service (Go): models, handlers, services, database splits
- backend-core (Python): security_api, rbac_api, pdf_service, auth splits
- admin-core (TypeScript): 5 page.tsx + sidebar extractions
- pitch-deck (TypeScript): 6 slides, 3 UI components, engine.ts splits
- voice-service (Python): enhanced_task_orchestrator split

Result: 0 violations, 36 exempted (pipeline, tests, pure-data files). Go build verified clean. No behavior changes: pure structural splits.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
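check-loc.sh itself is not included in this commit's diff. As a rough illustration of the guardrail it enforces, here is a minimal Python sketch; the 500-line budget matches the commit message, but the scanned extensions and exemption directories are assumptions:

#!/usr/bin/env python3
# Minimal sketch of a LOC-budget check. The real check-loc.sh is a shell
# script not shown here; extensions and exemptions below are illustrative.
import pathlib
import sys

BUDGET = 500  # max lines per source file, per the commit message
EXEMPT_DIRS = {"pipeline", "tests"}  # assumed exemption categories

violations = []
for path in pathlib.Path(".").rglob("*"):
    if path.suffix not in {".go", ".py", ".ts", ".tsx"}:
        continue
    if EXEMPT_DIRS & set(path.parts):
        continue
    loc = sum(1 for _ in path.open(encoding="utf-8", errors="ignore"))
    if loc > BUDGET:
        violations.append((path, loc))

for path, loc in violations:
    print(f"{path}: {loc} LOC exceeds budget of {BUDGET}")
sys.exit(1 if violations else 0)

Wired into a pre-commit hook, a non-zero exit blocks the commit until the offending file is split or exempted.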
backend-core/services/image_processing.py (new file, 213 lines)
@@ -0,0 +1,213 @@
"""
Image Processing and OCR Service.

Handles:
- Image preprocessing for better OCR results (grayscale, denoising, binarization)
- PaddleOCR integration for text recognition
- Handwriting region extraction from scanned documents

Used by FileProcessor for image and PDF-to-image OCR workflows.
"""

import logging
from typing import Optional, List, Dict, Any, Tuple

import cv2
import numpy as np
from PIL import Image

from .file_processor_types import ProcessedRegion

logger = logging.getLogger(__name__)


class ImageProcessor:
    """
    Image preprocessing and OCR for BreakPilot.

    Supports:
    - PaddleOCR for German handwriting and printed text
    - OpenCV-based preprocessing (denoising, CLAHE, adaptive binarization)
    - Handwriting region extraction for exam correction
    """

    def __init__(self, ocr_lang: str = "de", use_gpu: bool = False):
        self.ocr_lang = ocr_lang
        self.use_gpu = use_gpu
        self._ocr_engine = None

    @property
    def ocr_engine(self):
        """Lazy-loads the OCR engine."""
        if self._ocr_engine is None:
            self._ocr_engine = self._init_ocr_engine()
        return self._ocr_engine

    def _init_ocr_engine(self):
        """Initializes PaddleOCR, or returns None as a fallback."""
        try:
            from paddleocr import PaddleOCR
            return PaddleOCR(
                use_angle_cls=True,
                lang='german',
                use_gpu=self.use_gpu,
                show_log=False
            )
        except ImportError:
            logger.warning("PaddleOCR not installed - using fallback")
            return None

    def preprocess_image(self, img: Image.Image) -> Image.Image:
        """
        Preprocesses the image for better OCR results.

        - Conversion to grayscale
        - Contrast enhancement
        - Noise reduction
        - Binarization
        """
        # PIL to OpenCV
        cv_img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)

        # Convert to grayscale
        gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)

        # Noise reduction
        denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)

        # Contrast enhancement (CLAHE)
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(denoised)

        # Adaptive binarization
        binary = cv2.adaptiveThreshold(
            enhanced,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            11,
            2
        )

        # Back to PIL
        return Image.fromarray(binary)

    def ocr_image(self, img: Image.Image) -> Dict[str, Any]:
        """
        Runs OCR on an image.

        Returns:
            Dict with text, confidence, and regions
        """
        if self.ocr_engine is None:
            return {
                "text": "[OCR not available - please install PaddleOCR]",
                "confidence": 0.0,
                "regions": []
            }

        # PIL to numpy array
        img_array = np.array(img)

        # If grayscale, convert to RGB (PaddleOCR expects RGB)
        if len(img_array.shape) == 2:
            img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)

        # Run OCR
        result = self.ocr_engine.ocr(img_array, cls=True)

        if not result or not result[0]:
            return {"text": "", "confidence": 0.0, "regions": []}

        all_text = []
        all_regions = []
        total_confidence = 0.0

        for line in result[0]:
            bbox_points = line[0]  # [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            text, confidence = line[1]

            # Convert bounding box to x1, y1, x2, y2
            x_coords = [p[0] for p in bbox_points]
            y_coords = [p[1] for p in bbox_points]
            bbox = (
                int(min(x_coords)),
                int(min(y_coords)),
                int(max(x_coords)),
                int(max(y_coords))
            )

            all_text.append(text)
            all_regions.append(ProcessedRegion(
                text=text,
                confidence=confidence,
                bbox=bbox
            ))
            total_confidence += confidence

        avg_confidence = total_confidence / len(all_regions) if all_regions else 0.0

        return {
            "text": "\n".join(all_text),
            "confidence": avg_confidence,
            "regions": all_regions
        }

    def extract_handwriting_regions(
        self,
        img: Image.Image,
        min_area: int = 500
    ) -> List[Dict[str, Any]]:
        """
        Detects and extracts handwritten regions from an image.

        Useful for exams with printed questions and handwritten answers.

        Args:
            img: Input image
            min_area: Minimum area for detected regions

        Returns:
            List of regions with coordinates and recognized text
        """
        # Image preprocessing
        cv_img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
        gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)

        # Detect edges
        edges = cv2.Canny(gray, 50, 150)

        # Morphological operations to connect nearby strokes
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 5))
        dilated = cv2.dilate(edges, kernel, iterations=2)

        # Find contours
        contours, _ = cv2.findContours(
            dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )

        regions = []
        for contour in contours:
            area = cv2.contourArea(contour)
            if area < min_area:
                continue

            x, y, w, h = cv2.boundingRect(contour)

            # Crop the region
            region_img = img.crop((x, y, x + w, y + h))

            # Run OCR on the region
            ocr_result = self.ocr_image(region_img)

            regions.append({
                "bbox": (x, y, x + w, y + h),
                "area": area,
                "text": ocr_result["text"],
                "confidence": ocr_result["confidence"]
            })

        # Sort by Y position (top to bottom)
        regions.sort(key=lambda r: r["bbox"][1])

        return regions
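A brief usage sketch, not part of the commit: the import path (the backend-core directory is not a valid Python package name as-is) and the sample filename are assumptions.

# Usage sketch; import path and filename are hypothetical.
from PIL import Image

from services.image_processing import ImageProcessor

processor = ImageProcessor(ocr_lang="de", use_gpu=False)
page = Image.open("scanned_exam_page.png").convert("RGB")

# Full-page OCR on the preprocessed image
clean = processor.preprocess_image(page)
result = processor.ocr_image(clean)
print(result["text"], result["confidence"])

# Or extract handwritten answer regions individually
for region in processor.extract_handwriting_regions(page):
    print(region["bbox"], region["text"])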