[split-required] Split remaining 500-680 LOC files (final batch)
website (17 pages + 3 components): - multiplayer/wizard, middleware/wizard+test-wizard, communication - builds/wizard, staff-search, voice, sbom/wizard - foerderantrag, mail/tasks, tools/communication, sbom - compliance/evidence, uni-crawler, brandbook (already done) - CollectionsTab, IngestionTab, RiskHeatmap backend-lehrer (5 files): - letters_api (641 → 2), certificates_api (636 → 2) - alerts_agent/db/models (636 → 3) - llm_gateway/communication_service (614 → 2) - game/database already done in prior batch klausur-service (2 files): - hybrid_vocab_extractor (664 → 2) - klausur-service/frontend: api.ts (620 → 3), EHUploadWizard (591 → 2) voice-service (3 files): - bqas/rag_judge (618 → 3), runner (529 → 2) - enhanced_task_orchestrator (519 → 2) studio-v2 (6 files): - korrektur/[klausurId] (578 → 4), fairness (569 → 2) - AlertsWizard (552 → 2), OnboardingWizard (513 → 2) - korrektur/api.ts (506 → 3), geo-lernwelt (501 → 2) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
300
klausur-service/backend/hybrid_vocab_ocr.py
Normal file
300
klausur-service/backend/hybrid_vocab_ocr.py
Normal file
@@ -0,0 +1,300 @@
|
||||
"""
|
||||
Hybrid Vocab OCR - PaddleOCR integration and result parsing.
|
||||
|
||||
Handles:
|
||||
- PaddleOCR lazy loading and initialization
|
||||
- Running OCR on image bytes
|
||||
- Parsing PaddleOCR v3 dict and traditional list formats
|
||||
- Grouping regions by rows and detecting columns
|
||||
"""
|
||||
|
||||
import io
import logging
from typing import List, Tuple
from dataclasses import dataclass

import numpy as np
from PIL import Image

# OpenCV is optional: when absent, preprocess_image() raises a clear
# ImportError instead of the module failing to import.
try:
    import cv2
    CV2_AVAILABLE = True
except ImportError:
    cv2 = None
    CV2_AVAILABLE = False

logger = logging.getLogger(__name__)

# Module-level PaddleOCR singleton; created lazily by get_paddle_ocr()
# to avoid the heavy model-loading cost at import time.
_paddle_ocr = None
||||
@dataclass
class OCRRegion:
    """A recognized text region with its bounding-box position (pixel coordinates)."""
    text: str  # recognized text content
    confidence: float  # OCR confidence score (0.5 is used as a fallback by the parsers)
    x1: int  # left edge of the bounding box
    y1: int  # top edge of the bounding box
    x2: int  # right edge of the bounding box
    y2: int  # bottom edge of the bounding box

    @property
    def center_x(self) -> int:
        # Horizontal center of the bounding box (integer division).
        return (self.x1 + self.x2) // 2

    @property
    def center_y(self) -> int:
        # Vertical center of the bounding box; used for row grouping.
        return (self.y1 + self.y2) // 2
|
||||
|
||||
|
||||
def get_paddle_ocr():
    """Lazily initialize and return the shared PaddleOCR instance.

    Tries lang="de" first, then lang="en", then library defaults.
    Returns None if PaddleOCR could not be initialized at all; the
    result is cached in the module-level ``_paddle_ocr`` singleton.
    """
    global _paddle_ocr
    if _paddle_ocr is None:
        try:
            from paddleocr import PaddleOCR
            import logging as std_logging

            # Quiet PaddleOCR's chatty loggers.  BUGFIX: the original list
            # also included 'root', which set the *root* logger to WARNING
            # and thereby silenced this application's own INFO logs
            # (including the logger.info calls just below).  Only the
            # paddle-family loggers are targeted now.
            for logger_name in ['ppocr', 'paddle', 'paddleocr']:
                std_logging.getLogger(logger_name).setLevel(std_logging.WARNING)

            try:
                _paddle_ocr = PaddleOCR(lang="de")
                logger.info("PaddleOCR 3.x initialized (lang=de)")
            except Exception as e1:
                logger.warning(f"PaddleOCR lang=de failed: {e1}")
                try:
                    _paddle_ocr = PaddleOCR(lang="en")
                    logger.info("PaddleOCR 3.x initialized (lang=en)")
                except Exception as e2:
                    logger.warning(f"PaddleOCR lang=en failed: {e2}")
                    _paddle_ocr = PaddleOCR()
                    logger.info("PaddleOCR 3.x initialized (defaults)")

        except Exception as e:
            logger.error(f"PaddleOCR initialization failed: {e}")
            _paddle_ocr = None

    return _paddle_ocr
|
||||
|
||||
|
||||
def preprocess_image(img: Image.Image) -> np.ndarray:
    """Convert a PIL image into a 3-channel array suitable for OCR.

    Grayscale (H, W) input is expanded to RGB and RGBA input is reduced
    to RGB.  Raises ImportError when OpenCV is not installed, since the
    channel conversions rely on cv2.
    """
    if not CV2_AVAILABLE:
        raise ImportError(
            "OpenCV (cv2) is required for image preprocessing. "
            "Install with: pip install opencv-python-headless"
        )

    arr = np.array(img)
    if arr.ndim == 2:
        # Single-channel grayscale -> 3-channel RGB.
        arr = cv2.cvtColor(arr, cv2.COLOR_GRAY2RGB)
    elif arr.shape[2] == 4:
        # Drop the alpha channel.
        arr = cv2.cvtColor(arr, cv2.COLOR_RGBA2RGB)
    return arr
|
||||
|
||||
|
||||
def run_paddle_ocr(image_bytes: bytes) -> Tuple[List[OCRRegion], str]:
    """Run PaddleOCR on an image given as raw bytes.

    Returns a tuple ``(regions, raw_text)``; both are empty when OCR is
    unavailable or any step fails.  Dispatches on the several result
    shapes PaddleOCR can return: a 3.x dict, 3.x OCRResult mapping
    objects, and the traditional ``[[bbox, (text, conf)], ...]`` format.
    """
    ocr = get_paddle_ocr()
    if ocr is None:
        logger.error("PaddleOCR not available")
        return [], ""

    try:
        img = Image.open(io.BytesIO(image_bytes))
        img_array = preprocess_image(img)

        try:
            result = ocr.ocr(img_array)
        except TypeError:
            # BUGFIX: the original fallback repeated the identical
            # ocr.ocr(img_array) call, so it could never succeed where
            # the first attempt raised TypeError.  PaddleOCR 3.x exposes
            # predict() as its primary API; use it as the real alternative.
            logger.warning("Trying alternative OCR call method")
            result = ocr.predict(img_array)

        if not result:
            logger.warning("PaddleOCR returned empty result")
            return [], ""

        # PaddleOCR 3.x may return a bare dict for a single page.
        if isinstance(result, dict):
            logger.info("Processing PaddleOCR 3.x dict format")
            return _parse_paddleocr_v3_dict(result)
        elif isinstance(result, list) and len(result) > 0:
            first_item = result[0]
            if first_item is None:
                logger.warning("PaddleOCR returned None for first page")
                return [], ""

            # 3.x OCRResult behaves like a mapping exposing rec_texts/texts.
            if hasattr(first_item, 'get') or isinstance(first_item, dict):
                item_dict = dict(first_item) if hasattr(first_item, 'items') else first_item
                if 'rec_texts' in item_dict or 'texts' in item_dict:
                    logger.info("Processing PaddleOCR 3.x OCRResult format")
                    return _parse_paddleocr_v3_dict(item_dict)

            # Traditional format: page is a list of [bbox, (text, conf)] entries.
            if isinstance(first_item, list):
                if len(first_item) > 0 and isinstance(first_item[0], (list, tuple)):
                    logger.info("Processing PaddleOCR traditional list format")
                    return _parse_paddleocr_list(first_item)

            # Last resort: attempt a dict conversion before giving up.
            logger.warning(f"Unknown result format. Type: {type(first_item)}")
            try:
                item_dict = dict(first_item)
                if 'rec_texts' in item_dict:
                    return _parse_paddleocr_v3_dict(item_dict)
            except Exception as e:
                logger.warning(f"Could not convert to dict: {e}")
            return [], ""
        else:
            logger.warning(f"Unexpected PaddleOCR result type: {type(result)}")
            return [], ""

    except Exception as e:
        logger.error(f"PaddleOCR execution failed: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return [], ""
|
||||
|
||||
|
||||
def _parse_paddleocr_v3_dict(result: dict) -> Tuple[List[OCRRegion], str]:
    """Parse a PaddleOCR 3.x dict-format result into OCRRegions.

    Reads recognized texts and scores plus, when present, ``rec_boxes``
    (flat [x1, y1, x2, y2]) or ``dt_polys`` (4-point polygons) for the
    geometry.  Returns regions sorted top-to-bottom and the raw text
    joined with newlines.
    """
    regions = []
    all_text_lines = []

    # Key names vary between releases; accept both spellings.
    texts = result.get('rec_texts', result.get('texts', []))
    scores = result.get('rec_scores', result.get('scores', []))
    polys = result.get('dt_polys', result.get('boxes', []))
    rec_boxes = result.get('rec_boxes', [])

    logger.info(f"PaddleOCR 3.x: {len(texts)} texts, {len(scores)} scores, {len(polys)} polys, {len(rec_boxes)} rec_boxes")

    for i, (text, score) in enumerate(zip(texts, scores)):
        if not text or not str(text).strip():
            continue

        # Normalize once so non-str text values cannot break .strip() below
        # (the guard above already coerces with str()).
        text = str(text).strip()

        # Placeholder geometry used when no box data exists for this item.
        x1, y1, x2, y2 = 0, 0, 100, 50

        if i < len(rec_boxes) and rec_boxes[i] is not None:
            box = rec_boxes[i]
            try:
                if hasattr(box, 'flatten'):
                    # numpy array -> flat python list
                    box = box.flatten().tolist()
                if len(box) >= 4:
                    x1, y1, x2, y2 = int(box[0]), int(box[1]), int(box[2]), int(box[3])
            except Exception as e:
                logger.debug(f"Could not parse rec_box: {e}")

        elif i < len(polys) and polys[i] is not None:
            poly = polys[i]
            try:
                if hasattr(poly, 'tolist'):
                    poly = poly.tolist()
                if len(poly) >= 4:
                    # Axis-aligned bounding box of the polygon.
                    x_coords = [p[0] for p in poly]
                    y_coords = [p[1] for p in poly]
                    x1, y1 = int(min(x_coords)), int(min(y_coords))
                    x2, y2 = int(max(x_coords)), int(max(y_coords))
            except Exception as e:
                logger.debug(f"Could not parse polygon: {e}")

        # BUGFIX: a legitimate score of 0.0 previously fell through to the
        # 0.5 default because of the truthiness check; only a missing
        # score should use the fallback.
        region = OCRRegion(
            text=text, confidence=float(score) if score is not None else 0.5,
            x1=x1, y1=y1, x2=x2, y2=y2
        )
        regions.append(region)
        all_text_lines.append(text)

    regions.sort(key=lambda r: r.y1)
    raw_text = "\n".join(all_text_lines)
    logger.info(f"PaddleOCR 3.x extracted {len(regions)} text regions")
    return regions, raw_text
|
||||
|
||||
|
||||
def _parse_paddleocr_list(page_result: list) -> Tuple[List[OCRRegion], str]:
    """Parse the traditional PaddleOCR page format into OCRRegions.

    Each entry is ``[bbox_points, (text, confidence)]`` (the payload may
    also be a bare string, in which case confidence defaults to 0.5).
    Returns regions sorted top-to-bottom plus the newline-joined raw text.
    """
    regions = []
    collected_texts = []

    for entry in page_result:
        if not entry or len(entry) < 2:
            continue

        bbox, payload = entry[0], entry[1]

        # The text payload is usually (text, conf); tolerate a bare string.
        if isinstance(payload, tuple) and len(payload) >= 2:
            text, conf = payload[0], payload[1]
        elif isinstance(payload, str):
            text, conf = payload, 0.5
        else:
            continue

        if not text or not text.strip():
            continue

        xs = [pt[0] for pt in bbox]
        ys = [pt[1] for pt in bbox]
        stripped = text.strip()

        regions.append(OCRRegion(
            text=stripped, confidence=float(conf),
            x1=int(min(xs)), y1=int(min(ys)),
            x2=int(max(xs)), y2=int(max(ys))
        ))
        collected_texts.append(stripped)

    # Top-to-bottom reading order.
    regions.sort(key=lambda r: r.y1)
    raw_text = "\n".join(collected_texts)
    logger.info(f"PaddleOCR extracted {len(regions)} text regions")
    return regions, raw_text
|
||||
|
||||
|
||||
def group_regions_by_rows(regions: List[OCRRegion], y_tolerance: int = 20) -> List[List[OCRRegion]]:
    """Group text regions into visual rows by vertical center position.

    A region joins the current row when its center_y lies within
    ``y_tolerance`` pixels of the row's anchor (the first region's
    center_y); each finished row is sorted left-to-right by x1.
    """
    if not regions:
        return []

    grouped = []
    row = [regions[0]]
    anchor_y = regions[0].center_y

    for reg in regions[1:]:
        if abs(reg.center_y - anchor_y) <= y_tolerance:
            row.append(reg)
            continue
        # Close out the current row and start a new one anchored here.
        grouped.append(sorted(row, key=lambda r: r.x1))
        row = [reg]
        anchor_y = reg.center_y

    grouped.append(sorted(row, key=lambda r: r.x1))
    return grouped
|
||||
|
||||
|
||||
def detect_columns(rows: List[List[OCRRegion]]) -> int:
    """Infer the table column count (2 or 3) from region positions.

    Only rows holding at least two regions are considered; an average of
    2.5+ regions per such row indicates three columns.  Defaults to 2
    when there is nothing usable.
    """
    multi_item_counts = [len(row) for row in rows if len(row) >= 2]
    if not rows or not multi_item_counts:
        return 2

    mean_count = sum(multi_item_counts) / len(multi_item_counts)
    return 3 if mean_count >= 2.5 else 2
|
||||
|
||||
|
||||
def format_ocr_for_llm(regions: List[OCRRegion]) -> str:
    """Render OCR regions as tab-separated rows for LLM consumption.

    Regions are grouped into visual rows; multi-region rows become
    tab-joined lines, single-region rows are emitted as-is.  The output
    is prefixed with the detected column count.
    """
    grouped = group_regions_by_rows(regions)
    columns = detect_columns(grouped)

    out = [f"Erkannte Spalten: {columns}", "---"]
    for row in grouped:
        if not row:
            continue
        out.append(row[0].text if len(row) == 1 else "\t".join(r.text for r in row))

    return "\n".join(out)
|
||||
Reference in New Issue
Block a user