[split-required] Split remaining 500-680 LOC files (final batch)

website (17 pages + 3 components):
- multiplayer/wizard, middleware/wizard+test-wizard, communication
- builds/wizard, staff-search, voice, sbom/wizard
- foerderantrag, mail/tasks, tools/communication, sbom
- compliance/evidence, uni-crawler, brandbook (already done)
- CollectionsTab, IngestionTab, RiskHeatmap

backend-lehrer (5 files):
- letters_api (641 → 2), certificates_api (636 → 2)
- alerts_agent/db/models (636 → 3)
- llm_gateway/communication_service (614 → 2)
- game/database already done in prior batch

klausur-service (2 files):
- hybrid_vocab_extractor (664 → 2)
- klausur-service/frontend: api.ts (620 → 3), EHUploadWizard (591 → 2)

voice-service (3 files):
- bqas/rag_judge (618 → 3), runner (529 → 2)
- enhanced_task_orchestrator (519 → 2)

studio-v2 (6 files):
- korrektur/[klausurId] (578 → 4), fairness (569 → 2)
- AlertsWizard (552 → 2), OnboardingWizard (513 → 2)
- korrektur/api.ts (506 → 3), geo-lernwelt (501 → 2)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-25 08:56:45 +02:00
parent b4613e26f3
commit 451365a312
115 changed files with 10694 additions and 13839 deletions

View File

@@ -0,0 +1,300 @@
"""
Hybrid Vocab OCR - PaddleOCR integration and result parsing.
Handles:
- PaddleOCR lazy loading and initialization
- Running OCR on image bytes
- Parsing PaddleOCR v3 dict and traditional list formats
- Grouping regions by rows and detecting columns
"""
import io
import logging
from typing import List, Tuple
from dataclasses import dataclass
import numpy as np
from PIL import Image
# OpenCV is optional
try:
import cv2
CV2_AVAILABLE = True
except ImportError:
cv2 = None
CV2_AVAILABLE = False
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Cached PaddleOCR engine; stays None until get_paddle_ocr() manages to build one.
_paddle_ocr = None
@dataclass
class OCRRegion:
    """A recognised piece of text together with its bounding box.

    The box is described by two corners, ``(x1, y1)`` and ``(x2, y2)``.
    """

    # Recognised text content.
    text: str
    # Recognition score attached to the text by the OCR engine.
    confidence: float
    # First corner of the bounding box.
    x1: int
    y1: int
    # Opposite corner of the bounding box.
    x2: int
    y2: int

    @property
    def center_x(self) -> int:
        """Midpoint between ``x1`` and ``x2``, rounded down."""
        horizontal_sum = self.x1 + self.x2
        return horizontal_sum // 2

    @property
    def center_y(self) -> int:
        """Midpoint between ``y1`` and ``y2``, rounded down."""
        vertical_sum = self.y1 + self.y2
        return vertical_sum // 2
def get_paddle_ocr():
    """Return the shared PaddleOCR engine, building it on first use.

    ``paddleocr`` is imported inside the function so that merely importing
    this module does not pay the start-up cost. Construction is attempted
    with ``lang="de"``, then ``lang="en"``, and finally with the library
    defaults.

    Returns:
        The cached PaddleOCR object, or ``None`` when the import or every
        construction attempt failed. Failures are logged, and because the
        cache stays ``None`` a later call tries again.
    """
    global _paddle_ocr

    # Fast path: an earlier call already produced an engine.
    if _paddle_ocr is not None:
        return _paddle_ocr

    try:
        from paddleocr import PaddleOCR
        import logging as std_logging

        # Raise the threshold of the OCR stack's loggers to WARNING.
        # NOTE(review): on Python 3.9+ getLogger('root') is the real root
        # logger, so this may also silence INFO output of every logger that
        # has no level of its own - confirm that this is intended.
        quiet_level = std_logging.WARNING
        for noisy_name in ('ppocr', 'paddle', 'paddleocr', 'root'):
            std_logging.getLogger(noisy_name).setLevel(quiet_level)

        try:
            _paddle_ocr = PaddleOCR(lang="de")
            logger.info("PaddleOCR 3.x initialized (lang=de)")
        except Exception as german_error:
            logger.warning(f"PaddleOCR lang=de failed: {german_error}")
            try:
                _paddle_ocr = PaddleOCR(lang="en")
                logger.info("PaddleOCR 3.x initialized (lang=en)")
            except Exception as english_error:
                logger.warning(f"PaddleOCR lang=en failed: {english_error}")
                # Last resort; a failure here is handled by the outer except.
                _paddle_ocr = PaddleOCR()
                logger.info("PaddleOCR 3.x initialized (defaults)")
    except Exception as init_error:
        logger.error(f"PaddleOCR initialization failed: {init_error}")
        _paddle_ocr = None

    return _paddle_ocr
def preprocess_image(img: Image.Image) -> np.ndarray:
    """Turn a PIL image into an RGB array suitable for OCR.

    Args:
        img: The image to convert.

    Returns:
        The image data as a NumPy array. Greyscale input is expanded to RGB
        and RGBA input has its alpha channel dropped; images in one of the
        modes listed below are converted to RGB by Pillow first.

    Raises:
        ImportError: If OpenCV (``cv2``) is not installed.
    """
    if not CV2_AVAILABLE:
        raise ImportError(
            "OpenCV (cv2) is required for image preprocessing. "
            "Install with: pip install opencv-python-headless"
        )

    # The checks below only look at the array rank and at a channel count of
    # four, which is not enough for these modes: CMYK also has four channels
    # and was treated as RGBA, palette images produce colour indices rather
    # than colours, and two-channel LA arrays were returned unchanged.
    # Let Pillow normalise them before the array is built.
    modes_needing_rgb = ("1", "P", "PA", "LA", "CMYK", "YCbCr")
    if getattr(img, "mode", None) in modes_needing_rgb:
        img = img.convert("RGB")

    img_array = np.array(img)
    if img_array.ndim == 2:
        # Single-channel (greyscale) data.
        img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
    elif img_array.shape[2] == 4:
        # RGBA: the alpha channel is discarded.
        img_array = cv2.cvtColor(img_array, cv2.COLOR_RGBA2RGB)
    return img_array
def run_paddle_ocr(image_bytes: bytes) -> Tuple[List[OCRRegion], str]:
    """Run PaddleOCR on an encoded image and normalise the result.

    The engine's return value is inspected and handed to the parser that
    matches its shape: a plain dict, a list whose first element is
    dict-like, or a list whose first element is a list of lines. Only the
    first element of a list result is evaluated.

    Args:
        image_bytes: Encoded image data; it is opened with ``PIL.Image.open``.

    Returns:
        ``(regions, raw_text)`` from the matching parser. ``([], "")`` is
        returned when the engine is unavailable, the result is empty or has
        an unrecognised shape, or any exception occurs. Errors are logged
        and never raised to the caller.
    """
    ocr = get_paddle_ocr()
    if ocr is None:
        logger.error("PaddleOCR not available")
        return [], ""

    try:
        img = Image.open(io.BytesIO(image_bytes))
        img_array = preprocess_image(img)

        # One call only. The former ``except TypeError`` fallback re-issued
        # this exact call with the same argument; a TypeError now goes
        # straight to the handler at the bottom.
        result = ocr.ocr(img_array)

        if not result:
            logger.warning("PaddleOCR returned empty result")
            return [], ""

        if isinstance(result, dict):
            logger.info("Processing PaddleOCR 3.x dict format")
            return _parse_paddleocr_v3_dict(result)

        if not (isinstance(result, list) and len(result) > 0):
            logger.warning(f"Unexpected PaddleOCR result type: {type(result)}")
            return [], ""

        first_item = result[0]
        if first_item is None:
            logger.warning("PaddleOCR returned None for first page")
            return [], ""

        # Dict-like result object: copy it into a real dict when possible.
        if hasattr(first_item, 'get') or isinstance(first_item, dict):
            item_dict = dict(first_item) if hasattr(first_item, 'items') else first_item
            if 'rec_texts' in item_dict or 'texts' in item_dict:
                logger.info("Processing PaddleOCR 3.x OCRResult format")
                return _parse_paddleocr_v3_dict(item_dict)

        # List of lines, each line itself a list or tuple.
        if isinstance(first_item, list):
            if len(first_item) > 0 and isinstance(first_item[0], (list, tuple)):
                logger.info("Processing PaddleOCR traditional list format")
                return _parse_paddleocr_list(first_item)

        logger.warning(f"Unknown result format. Type: {type(first_item)}")
        try:
            # Last resort: anything dict() can convert. Both key spellings
            # are accepted, matching the dict-like branch above.
            item_dict = dict(first_item)
            if 'rec_texts' in item_dict or 'texts' in item_dict:
                return _parse_paddleocr_v3_dict(item_dict)
        except Exception as e:
            logger.warning(f"Could not convert to dict: {e}")
        return [], ""
    except Exception as e:
        logger.error(f"PaddleOCR execution failed: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return [], ""
def _bbox_from_rec_box(box):
    """Return ``(x1, y1, x2, y2)`` read from a flat box, or None if unusable."""
    try:
        if hasattr(box, 'flatten'):
            box = box.flatten().tolist()
        if len(box) >= 4:
            return int(box[0]), int(box[1]), int(box[2]), int(box[3])
    except Exception as e:
        logger.debug(f"Could not parse rec_box: {e}")
    return None


def _bbox_from_polygon(poly):
    """Return the min/max bounds of a polygon's points, or None if unusable."""
    try:
        if hasattr(poly, 'tolist'):
            poly = poly.tolist()
        if len(poly) >= 4:
            x_coords = [p[0] for p in poly]
            y_coords = [p[1] for p in poly]
            return (
                int(min(x_coords)), int(min(y_coords)),
                int(max(x_coords)), int(max(y_coords)),
            )
    except Exception as e:
        logger.debug(f"Could not parse polygon: {e}")
    return None


def _parse_paddleocr_v3_dict(result: dict) -> Tuple[List[OCRRegion], str]:
    """Parse a PaddleOCR 3.x dict-style result.

    Texts are read from ``rec_texts`` (or ``texts``), scores from
    ``rec_scores`` (or ``scores``), polygons from ``dt_polys`` (or
    ``boxes``) and flat boxes from ``rec_boxes``.

    Args:
        result: Mapping holding the lists named above; missing keys are
            treated as empty lists.

    Returns:
        ``(regions, raw_text)``: the regions sorted by ``y1``, and the
        stripped texts joined with newlines in their original order.
        Entries with empty or whitespace-only text are skipped.
    """
    regions = []
    all_text_lines = []
    texts = result.get('rec_texts', result.get('texts', []))
    scores = result.get('rec_scores', result.get('scores', []))
    polys = result.get('dt_polys', result.get('boxes', []))
    rec_boxes = result.get('rec_boxes', [])
    logger.info(f"PaddleOCR 3.x: {len(texts)} texts, {len(scores)} scores, {len(polys)} polys, {len(rec_boxes)} rec_boxes")

    # Iterate over the texts themselves rather than zip(texts, scores), so a
    # missing or shorter score list no longer discards recognised text.
    for i, raw_text_item in enumerate(texts):
        if not raw_text_item:
            continue
        # Convert once, so non-str text objects are handled consistently.
        text = str(raw_text_item).strip()
        if not text:
            continue

        score = scores[i] if i < len(scores) else None
        # Only an absent score gets the neutral 0.5; a real 0.0 is kept.
        confidence = 0.5 if score is None else float(score)

        # Prefer the flat rec_box; if it is absent or cannot be parsed, fall
        # back to the polygon, and only then to the placeholder box.
        bbox = None
        if i < len(rec_boxes) and rec_boxes[i] is not None:
            bbox = _bbox_from_rec_box(rec_boxes[i])
        if bbox is None and i < len(polys) and polys[i] is not None:
            bbox = _bbox_from_polygon(polys[i])
        if bbox is None:
            bbox = (0, 0, 100, 50)
        x1, y1, x2, y2 = bbox

        regions.append(OCRRegion(
            text=text, confidence=confidence,
            x1=x1, y1=y1, x2=x2, y2=y2
        ))
        all_text_lines.append(text)

    regions.sort(key=lambda r: r.y1)
    raw_text = "\n".join(all_text_lines)
    logger.info(f"PaddleOCR 3.x extracted {len(regions)} text regions")
    return regions, raw_text
def _parse_paddleocr_list(page_result: list) -> Tuple[List[OCRRegion], str]:
    """Parse one page of the traditional PaddleOCR list result.

    Each usable entry has the corner points of the text box at index 0 and
    the text information at index 1, either a ``(text, confidence)`` tuple
    or a bare string (which gets a confidence of 0.5).

    Args:
        page_result: The list of entries for a single page.

    Returns:
        ``(regions, raw_text)``: the regions sorted by ``y1``, and the
        stripped texts joined with newlines in their original order.
    """
    found_regions = []
    all_text_lines = []

    for entry in page_result:
        if not entry or len(entry) < 2:
            continue
        bbox_points, text_info = entry[0], entry[1]

        # NOTE(review): a list-typed [text, confidence] pair is skipped by
        # the checks below - confirm the engine always yields tuples.
        if isinstance(text_info, str):
            text, confidence = text_info, 0.5
        elif isinstance(text_info, tuple) and len(text_info) >= 2:
            text, confidence = text_info[0], text_info[1]
        else:
            continue

        cleaned = text.strip() if text else ""
        if not cleaned:
            continue

        # The region's box is the min/max extent of the corner points.
        xs = [point[0] for point in bbox_points]
        ys = [point[1] for point in bbox_points]
        found_regions.append(OCRRegion(
            text=cleaned, confidence=float(confidence),
            x1=int(min(xs)), y1=int(min(ys)),
            x2=int(max(xs)), y2=int(max(ys))
        ))
        all_text_lines.append(cleaned)

    regions = sorted(found_regions, key=lambda r: r.y1)
    raw_text = "\n".join(all_text_lines)
    logger.info(f"PaddleOCR extracted {len(regions)} text regions")
    return regions, raw_text
def group_regions_by_rows(regions: List[OCRRegion], y_tolerance: int = 20) -> List[List[OCRRegion]]:
    """Group text regions into rows by their vertical centre.

    Regions are processed in the order given. A region joins the current
    row while its ``center_y`` is within ``y_tolerance`` of the ``center_y``
    of the region that opened that row; otherwise it opens a new row.

    Args:
        regions: Regions to group, in the order they should be scanned.
        y_tolerance: Largest allowed distance from the row's first region.

    Returns:
        The rows in the order they were opened, each sorted by ``x1``.
        An empty input yields an empty list.
    """
    if not regions:
        return []

    rows = []
    anchor_y = None  # center_y of the region that opened the newest row
    for region in regions:
        if rows and abs(region.center_y - anchor_y) <= y_tolerance:
            rows[-1].append(region)
        else:
            rows.append([region])
            anchor_y = region.center_y

    # Order every row from left to right.
    for row in rows:
        row.sort(key=lambda r: r.x1)
    return rows
def detect_columns(rows: List[List[OCRRegion]]) -> int:
    """Estimate whether the rows form a two- or three-column layout.

    Only rows with at least two entries are considered. The result is 3
    when their average length is 2.5 or more, and 2 otherwise.

    Args:
        rows: Rows of regions; only the length of each row is used.

    Returns:
        ``2`` or ``3``. ``2`` is also the answer when there are no rows or
        no row has at least two entries.
    """
    # Single-entry rows are left out of the average.
    multi_item_sizes = [size for size in map(len, rows or []) if size >= 2]
    if not multi_item_sizes:
        return 2

    mean_size = sum(multi_item_sizes) / len(multi_item_sizes)
    if mean_size >= 2.5:
        return 3
    return 2
def format_ocr_for_llm(regions: List[OCRRegion]) -> str:
    """Render OCR regions as plain text for an LLM prompt.

    The regions are grouped into rows and the column count is estimated.
    The output starts with a header line naming that count and a ``---``
    separator, followed by one line per row with the row's texts separated
    by tabs.

    Args:
        regions: The recognised regions to format.

    Returns:
        The formatted multi-line string.
    """
    rows = group_regions_by_rows(regions)
    num_columns = detect_columns(rows)

    # Joining a one-element row yields just that element's text.
    row_lines = ["\t".join(cell.text for cell in row) for row in rows if len(row) >= 1]
    return "\n".join([f"Erkannte Spalten: {num_columns}", "---", *row_lines])