[split-required] Split remaining 500-680 LOC files (final batch)
website (17 pages + 3 components): - multiplayer/wizard, middleware/wizard+test-wizard, communication - builds/wizard, staff-search, voice, sbom/wizard - foerderantrag, mail/tasks, tools/communication, sbom - compliance/evidence, uni-crawler, brandbook (already done) - CollectionsTab, IngestionTab, RiskHeatmap backend-lehrer (5 files): - letters_api (641 → 2), certificates_api (636 → 2) - alerts_agent/db/models (636 → 3) - llm_gateway/communication_service (614 → 2) - game/database already done in prior batch klausur-service (2 files): - hybrid_vocab_extractor (664 → 2) - klausur-service/frontend: api.ts (620 → 3), EHUploadWizard (591 → 2) voice-service (3 files): - bqas/rag_judge (618 → 3), runner (529 → 2) - enhanced_task_orchestrator (519 → 2) studio-v2 (6 files): - korrektur/[klausurId] (578 → 4), fairness (569 → 2) - AlertsWizard (552 → 2), OnboardingWizard (513 → 2) - korrektur/api.ts (506 → 3), geo-lernwelt (501 → 2) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
300
klausur-service/backend/hybrid_vocab_ocr.py
Normal file
300
klausur-service/backend/hybrid_vocab_ocr.py
Normal file
@@ -0,0 +1,300 @@
|
||||
"""
|
||||
Hybrid Vocab OCR - PaddleOCR integration and result parsing.
|
||||
|
||||
Handles:
|
||||
- PaddleOCR lazy loading and initialization
|
||||
- Running OCR on image bytes
|
||||
- Parsing PaddleOCR v3 dict and traditional list formats
|
||||
- Grouping regions by rows and detecting columns
|
||||
"""
|
||||
|
||||
import io
import logging
from typing import List, Tuple
from dataclasses import dataclass

import numpy as np
from PIL import Image

# OpenCV is optional: when absent, preprocess_image() raises a clear
# ImportError instead of the module failing to import.
try:
    import cv2
    CV2_AVAILABLE = True
except ImportError:
    cv2 = None
    CV2_AVAILABLE = False

logger = logging.getLogger(__name__)

# Module-level PaddleOCR singleton; created lazily by get_paddle_ocr()
# to avoid the heavy model-loading cost at import time.
_paddle_ocr = None
||||
@dataclass
class OCRRegion:
    """A recognized text region with its bounding-box position (pixel coordinates)."""
    text: str  # recognized text content
    confidence: float  # OCR confidence score (0.5 is used as a fallback by the parsers)
    x1: int  # left edge of the bounding box
    y1: int  # top edge of the bounding box
    x2: int  # right edge of the bounding box
    y2: int  # bottom edge of the bounding box

    @property
    def center_x(self) -> int:
        # Horizontal center of the bounding box (integer division).
        return (self.x1 + self.x2) // 2

    @property
    def center_y(self) -> int:
        # Vertical center of the bounding box; used for row grouping.
        return (self.y1 + self.y2) // 2
|
||||
|
||||
|
||||
def get_paddle_ocr():
    """Lazily initialize and return the shared PaddleOCR instance.

    Tries lang="de" first, then lang="en", then library defaults.
    Returns None if PaddleOCR could not be initialized at all; the
    result is cached in the module-level ``_paddle_ocr`` singleton.
    """
    global _paddle_ocr
    if _paddle_ocr is None:
        try:
            from paddleocr import PaddleOCR
            import logging as std_logging

            # Quiet PaddleOCR's chatty loggers.  BUGFIX: the original list
            # also included 'root', which set the *root* logger to WARNING
            # and thereby silenced this application's own INFO logs
            # (including the logger.info calls just below).  Only the
            # paddle-family loggers are targeted now.
            for logger_name in ['ppocr', 'paddle', 'paddleocr']:
                std_logging.getLogger(logger_name).setLevel(std_logging.WARNING)

            try:
                _paddle_ocr = PaddleOCR(lang="de")
                logger.info("PaddleOCR 3.x initialized (lang=de)")
            except Exception as e1:
                logger.warning(f"PaddleOCR lang=de failed: {e1}")
                try:
                    _paddle_ocr = PaddleOCR(lang="en")
                    logger.info("PaddleOCR 3.x initialized (lang=en)")
                except Exception as e2:
                    logger.warning(f"PaddleOCR lang=en failed: {e2}")
                    _paddle_ocr = PaddleOCR()
                    logger.info("PaddleOCR 3.x initialized (defaults)")

        except Exception as e:
            logger.error(f"PaddleOCR initialization failed: {e}")
            _paddle_ocr = None

    return _paddle_ocr
|
||||
|
||||
|
||||
def preprocess_image(img: Image.Image) -> np.ndarray:
    """Convert a PIL image into a 3-channel array suitable for OCR.

    Grayscale (H, W) input is expanded to RGB and RGBA input is reduced
    to RGB.  Raises ImportError when OpenCV is not installed, since the
    channel conversions rely on cv2.
    """
    if not CV2_AVAILABLE:
        raise ImportError(
            "OpenCV (cv2) is required for image preprocessing. "
            "Install with: pip install opencv-python-headless"
        )

    arr = np.array(img)
    if arr.ndim == 2:
        # Single-channel grayscale -> 3-channel RGB.
        arr = cv2.cvtColor(arr, cv2.COLOR_GRAY2RGB)
    elif arr.shape[2] == 4:
        # Drop the alpha channel.
        arr = cv2.cvtColor(arr, cv2.COLOR_RGBA2RGB)
    return arr
|
||||
|
||||
|
||||
def run_paddle_ocr(image_bytes: bytes) -> Tuple[List[OCRRegion], str]:
    """Run PaddleOCR on an image given as raw bytes.

    Returns a tuple ``(regions, raw_text)``; both are empty when OCR is
    unavailable or any step fails.  Dispatches on the several result
    shapes PaddleOCR can return: a 3.x dict, 3.x OCRResult mapping
    objects, and the traditional ``[[bbox, (text, conf)], ...]`` format.
    """
    ocr = get_paddle_ocr()
    if ocr is None:
        logger.error("PaddleOCR not available")
        return [], ""

    try:
        img = Image.open(io.BytesIO(image_bytes))
        img_array = preprocess_image(img)

        try:
            result = ocr.ocr(img_array)
        except TypeError:
            # BUGFIX: the original fallback repeated the identical
            # ocr.ocr(img_array) call, so it could never succeed where
            # the first attempt raised TypeError.  PaddleOCR 3.x exposes
            # predict() as its primary API; use it as the real alternative.
            logger.warning("Trying alternative OCR call method")
            result = ocr.predict(img_array)

        if not result:
            logger.warning("PaddleOCR returned empty result")
            return [], ""

        # PaddleOCR 3.x may return a bare dict for a single page.
        if isinstance(result, dict):
            logger.info("Processing PaddleOCR 3.x dict format")
            return _parse_paddleocr_v3_dict(result)
        elif isinstance(result, list) and len(result) > 0:
            first_item = result[0]
            if first_item is None:
                logger.warning("PaddleOCR returned None for first page")
                return [], ""

            # 3.x OCRResult behaves like a mapping exposing rec_texts/texts.
            if hasattr(first_item, 'get') or isinstance(first_item, dict):
                item_dict = dict(first_item) if hasattr(first_item, 'items') else first_item
                if 'rec_texts' in item_dict or 'texts' in item_dict:
                    logger.info("Processing PaddleOCR 3.x OCRResult format")
                    return _parse_paddleocr_v3_dict(item_dict)

            # Traditional format: page is a list of [bbox, (text, conf)] entries.
            if isinstance(first_item, list):
                if len(first_item) > 0 and isinstance(first_item[0], (list, tuple)):
                    logger.info("Processing PaddleOCR traditional list format")
                    return _parse_paddleocr_list(first_item)

            # Last resort: attempt a dict conversion before giving up.
            logger.warning(f"Unknown result format. Type: {type(first_item)}")
            try:
                item_dict = dict(first_item)
                if 'rec_texts' in item_dict:
                    return _parse_paddleocr_v3_dict(item_dict)
            except Exception as e:
                logger.warning(f"Could not convert to dict: {e}")
            return [], ""
        else:
            logger.warning(f"Unexpected PaddleOCR result type: {type(result)}")
            return [], ""

    except Exception as e:
        logger.error(f"PaddleOCR execution failed: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return [], ""
|
||||
|
||||
|
||||
def _parse_paddleocr_v3_dict(result: dict) -> Tuple[List[OCRRegion], str]:
    """Parse a PaddleOCR 3.x dict-format result into OCRRegions.

    Reads recognized texts and scores plus, when present, ``rec_boxes``
    (flat [x1, y1, x2, y2]) or ``dt_polys`` (4-point polygons) for the
    geometry.  Returns regions sorted top-to-bottom and the raw text
    joined with newlines.
    """
    regions = []
    all_text_lines = []

    # Key names vary between releases; accept both spellings.
    texts = result.get('rec_texts', result.get('texts', []))
    scores = result.get('rec_scores', result.get('scores', []))
    polys = result.get('dt_polys', result.get('boxes', []))
    rec_boxes = result.get('rec_boxes', [])

    logger.info(f"PaddleOCR 3.x: {len(texts)} texts, {len(scores)} scores, {len(polys)} polys, {len(rec_boxes)} rec_boxes")

    for i, (text, score) in enumerate(zip(texts, scores)):
        if not text or not str(text).strip():
            continue

        # Normalize once so non-str text values cannot break .strip() below
        # (the guard above already coerces with str()).
        text = str(text).strip()

        # Placeholder geometry used when no box data exists for this item.
        x1, y1, x2, y2 = 0, 0, 100, 50

        if i < len(rec_boxes) and rec_boxes[i] is not None:
            box = rec_boxes[i]
            try:
                if hasattr(box, 'flatten'):
                    # numpy array -> flat python list
                    box = box.flatten().tolist()
                if len(box) >= 4:
                    x1, y1, x2, y2 = int(box[0]), int(box[1]), int(box[2]), int(box[3])
            except Exception as e:
                logger.debug(f"Could not parse rec_box: {e}")

        elif i < len(polys) and polys[i] is not None:
            poly = polys[i]
            try:
                if hasattr(poly, 'tolist'):
                    poly = poly.tolist()
                if len(poly) >= 4:
                    # Axis-aligned bounding box of the polygon.
                    x_coords = [p[0] for p in poly]
                    y_coords = [p[1] for p in poly]
                    x1, y1 = int(min(x_coords)), int(min(y_coords))
                    x2, y2 = int(max(x_coords)), int(max(y_coords))
            except Exception as e:
                logger.debug(f"Could not parse polygon: {e}")

        # BUGFIX: a legitimate score of 0.0 previously fell through to the
        # 0.5 default because of the truthiness check; only a missing
        # score should use the fallback.
        region = OCRRegion(
            text=text, confidence=float(score) if score is not None else 0.5,
            x1=x1, y1=y1, x2=x2, y2=y2
        )
        regions.append(region)
        all_text_lines.append(text)

    regions.sort(key=lambda r: r.y1)
    raw_text = "\n".join(all_text_lines)
    logger.info(f"PaddleOCR 3.x extracted {len(regions)} text regions")
    return regions, raw_text
|
||||
|
||||
|
||||
def _parse_paddleocr_list(page_result: list) -> Tuple[List[OCRRegion], str]:
    """Parse the traditional PaddleOCR page format into OCRRegions.

    Each entry is ``[bbox_points, (text, confidence)]`` (the payload may
    also be a bare string, in which case confidence defaults to 0.5).
    Returns regions sorted top-to-bottom plus the newline-joined raw text.
    """
    regions = []
    collected_texts = []

    for entry in page_result:
        if not entry or len(entry) < 2:
            continue

        bbox, payload = entry[0], entry[1]

        # The text payload is usually (text, conf); tolerate a bare string.
        if isinstance(payload, tuple) and len(payload) >= 2:
            text, conf = payload[0], payload[1]
        elif isinstance(payload, str):
            text, conf = payload, 0.5
        else:
            continue

        if not text or not text.strip():
            continue

        xs = [pt[0] for pt in bbox]
        ys = [pt[1] for pt in bbox]
        stripped = text.strip()

        regions.append(OCRRegion(
            text=stripped, confidence=float(conf),
            x1=int(min(xs)), y1=int(min(ys)),
            x2=int(max(xs)), y2=int(max(ys))
        ))
        collected_texts.append(stripped)

    # Top-to-bottom reading order.
    regions.sort(key=lambda r: r.y1)
    raw_text = "\n".join(collected_texts)
    logger.info(f"PaddleOCR extracted {len(regions)} text regions")
    return regions, raw_text
|
||||
|
||||
|
||||
def group_regions_by_rows(regions: List[OCRRegion], y_tolerance: int = 20) -> List[List[OCRRegion]]:
    """Group text regions into visual rows by vertical center position.

    A region joins the current row when its center_y lies within
    ``y_tolerance`` pixels of the row's anchor (the first region's
    center_y); each finished row is sorted left-to-right by x1.
    """
    if not regions:
        return []

    grouped = []
    row = [regions[0]]
    anchor_y = regions[0].center_y

    for reg in regions[1:]:
        if abs(reg.center_y - anchor_y) <= y_tolerance:
            row.append(reg)
            continue
        # Close out the current row and start a new one anchored here.
        grouped.append(sorted(row, key=lambda r: r.x1))
        row = [reg]
        anchor_y = reg.center_y

    grouped.append(sorted(row, key=lambda r: r.x1))
    return grouped
|
||||
|
||||
|
||||
def detect_columns(rows: List[List[OCRRegion]]) -> int:
    """Infer the table column count (2 or 3) from region positions.

    Only rows holding at least two regions are considered; an average of
    2.5+ regions per such row indicates three columns.  Defaults to 2
    when there is nothing usable.
    """
    multi_item_counts = [len(row) for row in rows if len(row) >= 2]
    if not rows or not multi_item_counts:
        return 2

    mean_count = sum(multi_item_counts) / len(multi_item_counts)
    return 3 if mean_count >= 2.5 else 2
|
||||
|
||||
|
||||
def format_ocr_for_llm(regions: List[OCRRegion]) -> str:
    """Render OCR regions as tab-separated rows for LLM consumption.

    Regions are grouped into visual rows; multi-region rows become
    tab-joined lines, single-region rows are emitted as-is.  The output
    is prefixed with the detected column count.
    """
    grouped = group_regions_by_rows(regions)
    columns = detect_columns(grouped)

    out = [f"Erkannte Spalten: {columns}", "---"]
    for row in grouped:
        if not row:
            continue
        out.append(row[0].text if len(row) == 1 else "\t".join(r.text for r in row))

    return "\n".join(out)
|
||||
Reference in New Issue
Block a user