[split-required] Split final 43 files (500-668 LOC) to complete refactoring

klausur-service (11 files):
- cv_gutter_repair, ocr_pipeline_regression, upload_api
- ocr_pipeline_sessions, smart_spell, nru_worksheet_generator
- ocr_pipeline_overlays, mail/aggregator, zeugnis_api
- cv_syllable_detect, self_rag

backend-lehrer (17 files):
- classroom_engine/suggestions, generators/quiz_generator
- worksheets_api, llm_gateway/comparison, state_engine_api
- classroom/models (→ 4 submodules), services/file_processor
- alerts_agent/api/wizard+digests+routes, content_generators/pdf
- classroom/routes/sessions, llm_gateway/inference
- classroom_engine/analytics, auth/keycloak_auth
- alerts_agent/processing/rule_engine, ai_processor/print_versions

agent-core (5 files):
- brain/memory_store, brain/knowledge_graph, brain/context_manager
- orchestrator/supervisor, sessions/session_manager

admin-lehrer (5 components):
- GridOverlay, StepGridReview, DevOpsPipelineSidebar
- DataFlowDiagram, sbom/wizard/page

website (2 files):
- DependencyMap, lehrer/abitur-archiv

Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-25 09:41:42 +02:00
parent 451365a312
commit bd4b956e3c
113 changed files with 13790 additions and 14148 deletions

View File

@@ -15,60 +15,24 @@ Verwendet:
"""
import logging
import os
import io
import base64
from pathlib import Path
from typing import Optional, List, Dict, Any, Tuple, Union
from dataclasses import dataclass
from enum import Enum
from typing import Optional, List, Dict, Any
import cv2
import numpy as np
from PIL import Image
from .file_processor_models import (
FileType,
ProcessingMode,
ProcessedRegion,
ProcessingResult,
)
logger = logging.getLogger(__name__)
class FileType(str, Enum):
"""Unterstützte Dateitypen."""
PDF = "pdf"
IMAGE = "image"
DOCX = "docx"
DOC = "doc"
TXT = "txt"
UNKNOWN = "unknown"
class ProcessingMode(str, Enum):
"""Verarbeitungsmodi."""
OCR_HANDWRITING = "ocr_handwriting" # Handschrifterkennung
OCR_PRINTED = "ocr_printed" # Gedruckter Text
TEXT_EXTRACT = "text_extract" # Textextraktion (PDF/DOCX)
MIXED = "mixed" # Kombiniert OCR + Textextraktion
@dataclass
class ProcessedRegion:
"""Ein erkannter Textbereich."""
text: str
confidence: float
bbox: Tuple[int, int, int, int] # x1, y1, x2, y2
page: int = 1
@dataclass
class ProcessingResult:
"""Ergebnis der Dokumentenverarbeitung."""
text: str
confidence: float
regions: List[ProcessedRegion]
page_count: int
file_type: FileType
processing_mode: ProcessingMode
metadata: Dict[str, Any]
class FileProcessor:
"""
Zentrale Dokumentenverarbeitung für BreakPilot.
@@ -81,17 +45,9 @@ class FileProcessor:
"""
def __init__(self, ocr_lang: str = "de", use_gpu: bool = False):
    """Initialize the file processor.

    Args:
        ocr_lang: OCR language code (default: "de" for German).
        use_gpu: Whether the OCR engine should run on the GPU
            (speeds up processing).
    """
    self.ocr_lang = ocr_lang
    self.use_gpu = use_gpu
    # Backing field for the OCR engine; built on demand by the
    # ocr_engine property rather than eagerly here.
    self._ocr_engine = None
    logger.info(f"FileProcessor initialized (lang={ocr_lang}, gpu={use_gpu})")
@property
@@ -107,7 +63,7 @@ class FileProcessor:
from paddleocr import PaddleOCR
return PaddleOCR(
use_angle_cls=True,
lang='german', # Deutsch
lang='german',
use_gpu=self.use_gpu,
show_log=False
)
@@ -116,16 +72,7 @@ class FileProcessor:
return None
def detect_file_type(self, file_path: str = None, file_bytes: bytes = None) -> FileType:
"""
Erkennt den Dateityp.
Args:
file_path: Pfad zur Datei
file_bytes: Dateiinhalt als Bytes
Returns:
FileType enum
"""
"""Erkennt den Dateityp."""
if file_path:
ext = Path(file_path).suffix.lower()
if ext == ".pdf":
@@ -140,14 +87,13 @@ class FileProcessor:
return FileType.TXT
if file_bytes:
# Magic number detection
if file_bytes[:4] == b'%PDF':
return FileType.PDF
elif file_bytes[:8] == b'\x89PNG\r\n\x1a\n':
return FileType.IMAGE
elif file_bytes[:2] in [b'\xff\xd8', b'BM']: # JPEG, BMP
elif file_bytes[:2] in [b'\xff\xd8', b'BM']:
return FileType.IMAGE
elif file_bytes[:4] == b'PK\x03\x04': # ZIP (DOCX)
elif file_bytes[:4] == b'PK\x03\x04':
return FileType.DOCX
return FileType.UNKNOWN
@@ -158,17 +104,7 @@ class FileProcessor:
file_bytes: bytes = None,
mode: ProcessingMode = ProcessingMode.MIXED
) -> ProcessingResult:
"""
Verarbeitet ein Dokument.
Args:
file_path: Pfad zur Datei
file_bytes: Dateiinhalt als Bytes
mode: Verarbeitungsmodus
Returns:
ProcessingResult mit extrahiertem Text und Metadaten
"""
"""Verarbeitet ein Dokument."""
if not file_path and not file_bytes:
raise ValueError("Entweder file_path oder file_bytes muss angegeben werden")
@@ -186,18 +122,12 @@ class FileProcessor:
else:
raise ValueError(f"Nicht unterstützter Dateityp: {file_type}")
def _process_pdf(
self,
file_path: str = None,
file_bytes: bytes = None,
mode: ProcessingMode = ProcessingMode.MIXED
) -> ProcessingResult:
def _process_pdf(self, file_path=None, file_bytes=None, mode=ProcessingMode.MIXED):
"""Verarbeitet PDF-Dateien."""
try:
import fitz # PyMuPDF
import fitz
except ImportError:
logger.warning("PyMuPDF nicht installiert - versuche Fallback")
# Fallback: PDF als Bild behandeln
return self._process_image(file_path, file_bytes, mode)
if file_bytes:
@@ -205,35 +135,27 @@ class FileProcessor:
else:
doc = fitz.open(file_path)
all_text = []
all_regions = []
total_confidence = 0.0
region_count = 0
all_text, all_regions = [], []
total_confidence, region_count = 0.0, 0
for page_num, page in enumerate(doc, start=1):
# Erst versuchen Text direkt zu extrahieren
page_text = page.get_text()
if page_text.strip() and mode != ProcessingMode.OCR_HANDWRITING:
# PDF enthält Text (nicht nur Bilder)
all_text.append(page_text)
all_regions.append(ProcessedRegion(
text=page_text,
confidence=1.0,
text=page_text, confidence=1.0,
bbox=(0, 0, int(page.rect.width), int(page.rect.height)),
page=page_num
))
total_confidence += 1.0
region_count += 1
else:
# Seite als Bild rendern und OCR anwenden
pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # 2x Auflösung
pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
img_bytes = pix.tobytes("png")
img = Image.open(io.BytesIO(img_bytes))
ocr_result = self._ocr_image(img)
all_text.append(ocr_result["text"])
for region in ocr_result["regions"]:
region.page = page_num
all_regions.append(region)
@@ -241,55 +163,34 @@ class FileProcessor:
region_count += 1
doc.close()
avg_confidence = total_confidence / region_count if region_count > 0 else 0.0
return ProcessingResult(
text="\n\n".join(all_text),
confidence=avg_confidence,
text="\n\n".join(all_text), confidence=avg_confidence,
regions=all_regions,
page_count=len(doc) if hasattr(doc, '__len__') else 1,
file_type=FileType.PDF,
processing_mode=mode,
file_type=FileType.PDF, processing_mode=mode,
metadata={"source": file_path or "bytes"}
)
def _process_image(self, file_path=None, file_bytes=None, mode=ProcessingMode.MIXED):
    """Process an image file: preprocessing followed by OCR.

    Args:
        file_path: Path to the image on disk (used when file_bytes is None).
        file_bytes: Raw image content as bytes.
        mode: Requested processing mode (recorded in the result).

    Returns:
        ProcessingResult with OCR text, regions and image metadata.
    """
    if file_bytes:
        img = Image.open(io.BytesIO(file_bytes))
    else:
        img = Image.open(file_path)
    # Clean the image up (grayscale, denoise, contrast, binarize)
    # before handing it to the OCR engine.
    processed_img = self._preprocess_image(img)
    ocr_result = self._ocr_image(processed_img)
    return ProcessingResult(
        text=ocr_result["text"], confidence=ocr_result["confidence"],
        regions=ocr_result["regions"], page_count=1,
        file_type=FileType.IMAGE, processing_mode=mode,
        metadata={"source": file_path or "bytes", "image_size": img.size}
    )
def _process_docx(
self,
file_path: str = None,
file_bytes: bytes = None
) -> ProcessingResult:
def _process_docx(self, file_path=None, file_bytes=None):
"""Verarbeitet DOCX-Dateien."""
try:
from docx import Document
@@ -306,7 +207,6 @@ class FileProcessor:
if para.text.strip():
paragraphs.append(para.text)
# Auch Tabellen extrahieren
for table in doc.tables:
for row in table.rows:
row_text = " | ".join(cell.text for cell in row.cells)
@@ -316,25 +216,14 @@ class FileProcessor:
text = "\n\n".join(paragraphs)
return ProcessingResult(
text=text,
confidence=1.0, # Direkte Textextraktion
regions=[ProcessedRegion(
text=text,
confidence=1.0,
bbox=(0, 0, 0, 0),
page=1
)],
page_count=1,
file_type=FileType.DOCX,
text=text, confidence=1.0,
regions=[ProcessedRegion(text=text, confidence=1.0, bbox=(0, 0, 0, 0), page=1)],
page_count=1, file_type=FileType.DOCX,
processing_mode=ProcessingMode.TEXT_EXTRACT,
metadata={"source": file_path or "bytes"}
)
def _process_txt(
self,
file_path: str = None,
file_bytes: bytes = None
) -> ProcessingResult:
def _process_txt(self, file_path=None, file_bytes=None):
"""Verarbeitet Textdateien."""
if file_bytes:
text = file_bytes.decode('utf-8', errors='ignore')
@@ -343,146 +232,65 @@ class FileProcessor:
text = f.read()
return ProcessingResult(
text=text,
confidence=1.0,
regions=[ProcessedRegion(
text=text,
confidence=1.0,
bbox=(0, 0, 0, 0),
page=1
)],
page_count=1,
file_type=FileType.TXT,
text=text, confidence=1.0,
regions=[ProcessedRegion(text=text, confidence=1.0, bbox=(0, 0, 0, 0), page=1)],
page_count=1, file_type=FileType.TXT,
processing_mode=ProcessingMode.TEXT_EXTRACT,
metadata={"source": file_path or "bytes"}
)
def _preprocess_image(self, img: Image.Image) -> Image.Image:
    """Preprocess an image for better OCR results.

    Pipeline: grayscale -> denoising -> CLAHE contrast enhancement ->
    adaptive binarization.
    """
    # PIL (RGB) to OpenCV (BGR).
    cv_img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
    # Non-local-means denoising suppresses scan noise.
    denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
    # Local contrast enhancement (CLAHE) before thresholding.
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(denoised)
    # Adaptive thresholding copes with uneven lighting across the page.
    binary = cv2.adaptiveThreshold(
        enhanced, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 11, 2
    )
    # Back to PIL for the OCR stage.
    return Image.fromarray(binary)
def _ocr_image(self, img: Image.Image) -> Dict[str, Any]:
    """Run OCR on a single image.

    Returns:
        Dict with keys "text" (recognized lines joined by newlines),
        "confidence" (average over regions, 0.0 if none) and
        "regions" (list of ProcessedRegion).
    """
    if self.ocr_engine is None:
        # OCR backend missing - return a sentinel result instead of raising.
        return {"text": "[OCR nicht verfügbar - bitte PaddleOCR installieren]",
                "confidence": 0.0, "regions": []}
    img_array = np.array(img)
    # PaddleOCR expects an RGB array; expand grayscale input.
    if len(img_array.shape) == 2:
        img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
    result = self.ocr_engine.ocr(img_array, cls=True)
    if not result or not result[0]:
        return {"text": "", "confidence": 0.0, "regions": []}
    all_text, all_regions = [], []
    total_confidence = 0.0
    for line in result[0]:
        bbox_points = line[0]  # four corner points [[x1,y1], ..., [x4,y4]]
        text, confidence = line[1]
        # Collapse the quadrilateral to an axis-aligned (x1, y1, x2, y2) box.
        x_coords = [p[0] for p in bbox_points]
        y_coords = [p[1] for p in bbox_points]
        bbox = (int(min(x_coords)), int(min(y_coords)),
                int(max(x_coords)), int(max(y_coords)))
        all_text.append(text)
        all_regions.append(ProcessedRegion(text=text, confidence=confidence, bbox=bbox))
        total_confidence += confidence
    avg_confidence = total_confidence / len(all_regions) if all_regions else 0.0
    return {"text": "\n".join(all_text), "confidence": avg_confidence, "regions": all_regions}
def extract_handwriting_regions(
self,
img: Image.Image,
min_area: int = 500
) -> List[Dict[str, Any]]:
"""
Erkennt und extrahiert handschriftliche Bereiche aus einem Bild.
Nützlich für Klausuren mit gedruckten Fragen und handschriftlichen Antworten.
Args:
img: Eingabebild
min_area: Minimale Fläche für erkannte Regionen
Returns:
Liste von Regionen mit Koordinaten und erkanntem Text
"""
# Bildvorverarbeitung
def extract_handwriting_regions(self, img: Image.Image, min_area: int = 500) -> List[Dict[str, Any]]:
"""Erkennt und extrahiert handschriftliche Bereiche aus einem Bild."""
cv_img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
# Kanten erkennen
edges = cv2.Canny(gray, 50, 150)
# Morphologische Operationen zum Verbinden
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 5))
dilated = cv2.dilate(edges, kernel, iterations=2)
# Konturen finden
contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
regions = []
@@ -490,25 +298,15 @@ class FileProcessor:
area = cv2.contourArea(contour)
if area < min_area:
continue
x, y, w, h = cv2.boundingRect(contour)
# Region ausschneiden
region_img = img.crop((x, y, x + w, y + h))
# OCR auf Region anwenden
ocr_result = self._ocr_image(region_img)
regions.append({
"bbox": (x, y, x + w, y + h),
"area": area,
"text": ocr_result["text"],
"confidence": ocr_result["confidence"]
"bbox": (x, y, x + w, y + h), "area": area,
"text": ocr_result["text"], "confidence": ocr_result["confidence"]
})
# Nach Y-Position sortieren (oben nach unten)
regions.sort(key=lambda r: r["bbox"][1])
return regions
@@ -525,39 +323,25 @@ def get_file_processor() -> FileProcessor:
# Convenience functions
def process_file(file_path=None, file_bytes=None, mode=ProcessingMode.MIXED) -> ProcessingResult:
    """Process a file via the shared FileProcessor instance.

    Args:
        file_path: Path to the file on disk.
        file_bytes: File content as raw bytes (alternative to file_path).
        mode: Processing mode (default: MIXED).

    Returns:
        ProcessingResult with extracted text and metadata.
    """
    processor = get_file_processor()
    return processor.process(file_path, file_bytes, mode)
def extract_text_from_pdf(file_path=None, file_bytes=None) -> str:
    """Extract text from a PDF file using TEXT_EXTRACT mode."""
    result = process_file(file_path, file_bytes, ProcessingMode.TEXT_EXTRACT)
    return result.text
def ocr_image(file_path: str = None, file_bytes: bytes = None) -> str:
def ocr_image(file_path=None, file_bytes=None) -> str:
"""Führt OCR auf einem Bild aus."""
result = process_file(file_path, file_bytes, ProcessingMode.OCR_PRINTED)
return result.text
def ocr_handwriting(file_path: str = None, file_bytes: bytes = None) -> str:
def ocr_handwriting(file_path=None, file_bytes=None) -> str:
"""Führt Handschrift-OCR auf einem Bild aus."""
result = process_file(file_path, file_bytes, ProcessingMode.OCR_HANDWRITING)
return result.text

View File

@@ -0,0 +1,48 @@
"""
File Processor - Datenmodelle und Enums.
Typen fuer Dokumentenverarbeitung: Dateitypen, Modi, Ergebnisse.
"""
from typing import List, Dict, Any, Tuple
from dataclasses import dataclass
from enum import Enum
class FileType(str, Enum):
    """Supported file types for document processing."""

    PDF = "pdf"          # Portable Document Format
    IMAGE = "image"      # raster images (PNG, JPEG, BMP)
    DOCX = "docx"        # Office Open XML word document
    DOC = "doc"          # legacy Word document
    TXT = "txt"          # plain text
    UNKNOWN = "unknown"  # anything not recognized
class ProcessingMode(str, Enum):
    """Available document-processing strategies."""

    OCR_HANDWRITING = "ocr_handwriting"  # handwriting recognition
    OCR_PRINTED = "ocr_printed"          # printed-text OCR
    TEXT_EXTRACT = "text_extract"        # direct text extraction (PDF/DOCX)
    MIXED = "mixed"                      # OCR combined with text extraction
@dataclass
class ProcessedRegion:
    """A single recognized text region on a page."""

    text: str                        # recognized text content
    confidence: float                # recognition confidence score
    bbox: Tuple[int, int, int, int]  # (x1, y1, x2, y2) in pixels
    page: int = 1                    # 1-based page number
@dataclass
class ProcessingResult:
    """Result of a document-processing run.

    Bundles the extracted text of all pages with per-region details
    and bookkeeping metadata.
    """

    text: str                        # full extracted text (pages joined)
    confidence: float                # average region confidence (1.0 for direct extraction)
    regions: List[ProcessedRegion]   # individual recognized text regions
    page_count: int                  # number of pages processed
    file_type: FileType              # detected type of the input file
    processing_mode: ProcessingMode  # mode used for this run
    metadata: Dict[str, Any]         # extra info, e.g. {"source": ...}