[split-required] Split 500-850 LOC files (batch 2)

backend-lehrer (10 files):
- game/database.py (785 → 5), correction_api.py (683 → 4)
- classroom_engine/antizipation.py (676 → 5)
- llm_gateway schools/edu_search already done in prior batch

klausur-service (12 files):
- orientation_crop_api.py (694 → 5), pdf_export.py (677 → 4)
- zeugnis_crawler.py (676 → 5), grid_editor_api.py (671 → 5)
- eh_templates.py (658 → 5), mail/api.py (651 → 5)
- qdrant_service.py (638 → 5), training_api.py (625 → 4)

website (6 pages):
- middleware (696 → 8), mail (733 → 6), consent (628 → 8)
- compliance/risks (622 → 5), export (502 → 5), brandbook (629 → 7)

studio-v2 (3 components):
- B2BMigrationWizard (848 → 3), CleanupPanel (765 → 2)
- dashboard-experimental (739 → 2)

admin-lehrer (4 files):
- uebersetzungen (769 → 4), manager (670 → 2)
- ChunkBrowserQA (675 → 6), dsfa/page (674 → 5)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 08:24:01 +02:00
parent 34da9f4cda
commit b4613e26f3
118 changed files with 15258 additions and 14680 deletions
--- a/klausur-service/backend/grid_editor_api_grid.py
+++ b/klausur-service/backend/grid_editor_api_grid.py
@@ -0,0 +1,337 @@
+"""
+Grid Editor API — grid build, save, and retrieve endpoints.
+"""
+
+import logging
+import time
+from typing import Any, Dict
+
+from fastapi import APIRouter, HTTPException, Query, Request
+
+from grid_build_core import _build_grid_core
+from ocr_pipeline_session_store import (
+    get_session_db,
+    update_session_db,
+)
+from ocr_pipeline_common import (
+    _cache,
+    _load_session_to_cache,
+    _get_cached,
+)
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+# Router for the grid editor endpoints; all routes registered on it share
+# the /api/v1/ocr-pipeline prefix and the "grid-editor" tag.
+router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
+
+
+@router.post("/sessions/{session_id}/build-grid")
+async def build_grid(
+    session_id: str,
+    ipa_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
+    syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
+    enhance: bool = Query(True, description="Step 3: CLAHE + denoise for degraded scans"),
+    max_cols: int = Query(0, description="Step 2: Max column count (0=unlimited)"),
+    min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto)"),
+):
+    """Build the zone-aware structured grid from the session's existing words.
+
+    No OCR is run here: the session must already carry Kombi word results
+    (paddle-kombi or rapid-kombi). The grid core uses the image for box
+    detection and the word positions for the grid structure.
+
+    Query params:
+        ipa_mode: "auto" (only when English IPA detected), "all" (force),
+            "none" (skip). The pattern also accepts "de" and "en".
+        syllable_mode: "auto" (only when original has dividers), "all"
+            (force), "none" (skip). The pattern also accepts "de" and "en".
+        enhance: forwarded unchanged to the grid core.
+        max_cols: column limit; values <= 0 are forwarded as None.
+        min_conf: confidence threshold; values <= 0 are forwarded as None.
+
+    Returns:
+        The StructuredGrid (zones with their own columns, rows and cells)
+        for the frontend Excel-like editor. The same result is persisted as
+        grid_editor_result, an automatic snapshot is stored under
+        ground_truth["auto_grid_snapshot"], and current_step is set to 11.
+
+    Raises:
+        HTTPException: 404 if the session does not exist, 400 if the grid
+            core raises ValueError.
+    """
+    session = await get_session_db(session_id)
+    if not session:
+        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
+
+    # Non-positive query values mean "no explicit limit" for the grid core
+    column_limit = max_cols if max_cols > 0 else None
+    confidence_floor = min_conf if min_conf > 0 else None
+
+    try:
+        result = await _build_grid_core(
+            session_id,
+            session,
+            ipa_mode=ipa_mode,
+            syllable_mode=syllable_mode,
+            enhance=enhance,
+            max_columns=column_limit,
+            min_conf=confidence_floor,
+        )
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+
+    # Imported lazily: a top-level import would create a circular dependency
+    # with ocr_pipeline_regression.
+    from ocr_pipeline_regression import _build_reference_snapshot
+
+    # Map the OCR engine recorded on the word result to a pipeline label
+    engine = (session.get("word_result") or {}).get("ocr_engine", "")
+    if engine == "paddle_direct":
+        auto_pipeline = "paddle-direct"
+    elif engine in ("kombi", "rapid_kombi"):
+        auto_pipeline = "kombi"
+    else:
+        auto_pipeline = "pipeline"
+
+    # Keep the automatic grid so manual corrections can be compared to it later
+    auto_snapshot = _build_reference_snapshot(result, pipeline=auto_pipeline)
+    ground_truth = session.get("ground_truth") or {}
+    ground_truth["auto_grid_snapshot"] = auto_snapshot
+
+    # Step 11 = reconstruction complete
+    await update_session_db(
+        session_id,
+        grid_editor_result=result,
+        ground_truth=ground_truth,
+        current_step=11,
+    )
+
+    zone_count = len(result.get("zones", []))
+    summary = result.get("summary", {})
+    logger.info(
+        "build-grid session %s: %d zones, %d cols, %d rows, %d cells, %d boxes in %.2fs",
+        session_id,
+        zone_count,
+        summary.get("total_columns", 0),
+        summary.get("total_rows", 0),
+        summary.get("total_cells", 0),
+        result.get("boxes_detected", 0),
+        result.get("duration_seconds", 0),
+    )
+
+    return result
+
+
+def _rerun_word_cells(words: list) -> list:
+    """Wrap OCR words in the single-cell layout stored under word_result["cells"].
+
+    Each word is reduced to its text, bounding box (left/top/width/height) and
+    confidence (0 when missing); the cell text is all word texts joined by spaces.
+    """
+    word_boxes = [
+        {
+            "text": w["text"], "left": w["left"], "top": w["top"],
+            "width": w["width"], "height": w["height"], "conf": w.get("conf", 0),
+        }
+        for w in words
+    ]
+    return [{"text": " ".join(w["text"] for w in words), "word_boxes": word_boxes}]
+
+
+@router.post("/sessions/{session_id}/rerun-ocr-and-build-grid")
+async def rerun_ocr_and_build_grid(
+    session_id: str,
+    ipa_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
+    syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
+    enhance: bool = Query(True, description="Step 3: CLAHE + denoise for degraded scans"),
+    max_cols: int = Query(0, description="Step 2: Max column count (0=unlimited)"),
+    min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto)"),
+    vision_fusion: bool = Query(False, description="Step 4: Vision-LLM fusion for degraded scans"),
+    doc_category: str = Query("", description="Document type for Vision-LLM prompt context"),
+):
+    """Re-run OCR with quality settings, then rebuild the grid.
+
+    Unlike build-grid (which only rebuilds from existing words), this endpoint
+    runs RapidOCR and Tesseract again on the cropped (or, if absent, dewarped)
+    image, merges the words, optionally applies Vision-LLM fusion, stores the
+    new word_result and then builds and persists the grid.
+
+    Steps executed: Scan Quality -> Image Enhancement -> OCR -> Merge ->
+    (Vision Fusion) -> Grid Build
+
+    Query params:
+        ipa_mode, syllable_mode: forwarded to the grid build.
+        enhance: enables image enhancement (only applied when the scan is
+            rated degraded) and is forwarded to the grid build.
+        max_cols: column limit for the grid build; values <= 0 mean no limit.
+        min_conf: minimum Tesseract word confidence; values <= 0 use the
+            scan-quality recommendation, or 40 if quality scoring fails.
+        vision_fusion: run Vision-LLM fusion on the merged words.
+        doc_category: prompt context for the fusion; falls back to the
+            session's document_category, then to "vokabelseite".
+
+    Returns:
+        The grid result, extended with "scan_quality" and "ocr_stats". These
+        two keys are added after the grid has been persisted.
+
+    Raises:
+        HTTPException: 404 if the session does not exist (initially or when
+            reloaded), 400 if no cropped/dewarped image is available or the
+            grid core raises ValueError.
+    """
+    session = await get_session_db(session_id)
+    if not session:
+        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
+
+    t0 = time.time()
+
+    # 1. Load the cropped/dewarped image from cache or session
+    if session_id not in _cache:
+        await _load_session_to_cache(session_id)
+    cached = _get_cached(session_id)
+
+    # Prefer the cropped image; fall back to the dewarped one
+    dewarped_bgr = cached.get("cropped_bgr")
+    if dewarped_bgr is None:
+        dewarped_bgr = cached.get("dewarped_bgr")
+    if dewarped_bgr is None:
+        raise HTTPException(status_code=400, detail="No cropped/dewarped image available. Run preprocessing steps first.")
+
+    img_h, img_w = dewarped_bgr.shape[:2]
+    ocr_input = dewarped_bgr.copy()
+
+    # 2. Scan quality assessment (best effort)
+    scan_quality_info = {}
+    try:
+        from scan_quality import score_scan_quality
+        quality_report = score_scan_quality(ocr_input)
+        scan_quality_info = quality_report.to_dict()
+        actual_min_conf = min_conf if min_conf > 0 else quality_report.recommended_min_conf
+    except Exception as e:
+        logger.warning("rerun-ocr: scan quality failed: %s", e)
+        actual_min_conf = min_conf if min_conf > 0 else 40
+
+    # 3. Image enhancement (Step 3) — only for scans rated as degraded
+    is_degraded = scan_quality_info.get("is_degraded", False)
+    enhance_applied = False
+    if enhance and is_degraded:
+        try:
+            from ocr_image_enhance import enhance_for_ocr
+            ocr_input = enhance_for_ocr(ocr_input, is_degraded=True)
+            # Set only on success so ocr_stats does not claim a failed enhancement
+            enhance_applied = True
+            logger.info("rerun-ocr: CLAHE enhancement applied")
+        except Exception as e:
+            logger.warning("rerun-ocr: enhancement failed: %s", e)
+
+    # 4. Run dual-engine OCR
+    # NOTE(review): both OCR calls below run synchronously inside this
+    # coroutine and presumably block the event loop while they work —
+    # consider offloading them to a worker thread.
+    from PIL import Image
+    import pytesseract
+
+    # RapidOCR (best effort: a failure leaves rapid_words empty)
+    rapid_words = []
+    try:
+        from cv_ocr_engines import ocr_region_rapid
+        from cv_vocab_types import PageRegion
+        full_region = PageRegion(type="full_page", x=0, y=0, width=img_w, height=img_h)
+        rapid_words = ocr_region_rapid(ocr_input, full_region) or []
+    except Exception as e:
+        logger.warning("rerun-ocr: RapidOCR failed: %s", e)
+
+    # Tesseract (channel order is reversed before handing the array to PIL)
+    pil_img = Image.fromarray(ocr_input[:, :, ::-1])
+    data = pytesseract.image_to_data(pil_img, lang='eng+deu', config='--psm 6 --oem 3', output_type=pytesseract.Output.DICT)
+    tess_words = []
+    for i, raw_text in enumerate(data["text"]):
+        text = (raw_text or "").strip()
+        # Confidence may arrive as int, float or a numeric string such as
+        # "96.5"; anything unparseable counts as -1 and is filtered out.
+        try:
+            conf = int(float(str(data["conf"][i])))
+        except (ValueError, OverflowError):
+            conf = -1
+        if not text or conf < actual_min_conf:
+            continue
+        tess_words.append({
+            "text": text, "left": data["left"][i], "top": data["top"][i],
+            "width": data["width"][i], "height": data["height"][i], "conf": conf,
+        })
+
+    # 5. Merge OCR results
+    from ocr_pipeline_ocr_merge import _split_paddle_multi_words, _merge_paddle_tesseract, _deduplicate_words
+    rapid_split = _split_paddle_multi_words(rapid_words) if rapid_words else []
+    if rapid_split or tess_words:
+        merged_words = _merge_paddle_tesseract(rapid_split, tess_words)
+        merged_words = _deduplicate_words(merged_words)
+    else:
+        merged_words = tess_words
+
+    # 6. Store updated word_result in session
+    word_result = {
+        "cells": _rerun_word_cells(merged_words),
+        "image_width": img_w,
+        "image_height": img_h,
+        "ocr_engine": "rapid_kombi",
+        "word_count": len(merged_words),
+        "raw_paddle_words": rapid_words,
+    }
+
+    # 6b. Vision-LLM Fusion (Step 4) — correct OCR using Vision model
+    vision_applied = False
+    if vision_fusion:
+        try:
+            from vision_ocr_fusion import vision_fuse_ocr
+            category = doc_category or session.get("document_category") or "vokabelseite"
+            logger.info("rerun-ocr: running Vision-LLM fusion (category=%s)", category)
+            fused_words = await vision_fuse_ocr(ocr_input, merged_words, category)
+            fused_cells = _rerun_word_cells(fused_words)
+            # Commit only once everything succeeded, so a failure cannot leave
+            # fused words next to pre-fusion cells in word_result.
+            merged_words = fused_words
+            word_result["cells"] = fused_cells
+            word_result["word_count"] = len(fused_words)
+            word_result["ocr_engine"] = "vision_fusion"
+            vision_applied = True
+        except Exception as e:
+            logger.warning("rerun-ocr: Vision-LLM fusion failed: %s", e)
+
+    await update_session_db(session_id, word_result=word_result)
+
+    # Reload session with updated word_result
+    session = await get_session_db(session_id)
+    if not session:
+        # The session vanished between the update and the reload
+        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
+
+    ocr_duration = time.time() - t0
+    logger.info(
+        "rerun-ocr session %s: %d words (rapid=%d, tess=%d, merged=%d) in %.1fs "
+        "(enhance=%s, min_conf=%d, quality=%s)",
+        session_id, len(merged_words), len(rapid_words), len(tess_words),
+        len(merged_words), ocr_duration, enhance, actual_min_conf,
+        scan_quality_info.get("quality_pct", "?"),
+    )
+
+    # 7. Build grid from new words
+    try:
+        result = await _build_grid_core(
+            session_id, session,
+            ipa_mode=ipa_mode, syllable_mode=syllable_mode,
+            enhance=enhance,
+            max_columns=max_cols if max_cols > 0 else None,
+            min_conf=min_conf if min_conf > 0 else None,
+        )
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e)) from e
+
+    # Persist grid
+    # NOTE(review): unlike build-grid, no auto_grid_snapshot is written to
+    # ground_truth here — confirm this is intended.
+    await update_session_db(session_id, grid_editor_result=result, current_step=11)
+
+    # Add quality info to response (after persisting, so it is not stored)
+    result["scan_quality"] = scan_quality_info
+    result["ocr_stats"] = {
+        "rapid_words": len(rapid_words),
+        "tess_words": len(tess_words),
+        "merged_words": len(merged_words),
+        "min_conf_used": actual_min_conf,
+        "enhance_applied": enhance_applied,
+        "vision_fusion_applied": vision_applied,
+        "document_category": doc_category or session.get("document_category") or "",
+        "ocr_duration_seconds": round(ocr_duration, 1),
+    }
+
+    total_duration = time.time() - t0
+    summary = result.get("summary", {})
+    logger.info(
+        "rerun-ocr+build-grid session %s: %d zones, %d cols, %d cells in %.1fs",
+        session_id,
+        len(result.get("zones", [])),
+        summary.get("total_columns", 0),
+        summary.get("total_cells", 0),
+        total_duration,
+    )
+
+    return result
+
+
+@router.post("/sessions/{session_id}/save-grid")
+async def save_grid(session_id: str, request: Request) -> Dict[str, Any]:
+    """Save edited grid data from the frontend Excel-like editor.
+
+    Receives the full StructuredGrid with user edits (text changes,
+    formatting changes like bold columns, header rows, etc.) and
+    persists it to the session's grid_editor_result with "edited" set to
+    True and current_step set to 11.
+
+    Fields missing from the body (image_width, image_height, boxes_detected,
+    summary, formatting) are taken from the previously stored result;
+    duration_seconds always comes from the stored result.
+
+    Returns:
+        {"session_id": <id>, "saved": True}
+
+    Raises:
+        HTTPException: 404 if the session does not exist; 400 if the body is
+            not valid JSON, is not a JSON object, has no "zones" key, or
+            "zones" is not a list.
+    """
+    session = await get_session_db(session_id)
+    if not session:
+        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
+
+    # A malformed body is a client error, not a server error
+    try:
+        body = await request.json()
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail="Request body is not valid JSON") from exc
+
+    # Validate basic structure before anything is written
+    if not isinstance(body, dict):
+        raise HTTPException(status_code=400, detail="Request body must be a JSON object")
+    if "zones" not in body:
+        raise HTTPException(status_code=400, detail="Missing 'zones' in request body")
+    zones = body["zones"]
+    if not isinstance(zones, list):
+        raise HTTPException(status_code=400, detail="'zones' must be a list")
+
+    # Preserve metadata from the original build
+    existing = session.get("grid_editor_result") or {}
+    result = {
+        "session_id": session_id,
+        "image_width": body.get("image_width", existing.get("image_width", 0)),
+        "image_height": body.get("image_height", existing.get("image_height", 0)),
+        "zones": zones,
+        "boxes_detected": body.get("boxes_detected", existing.get("boxes_detected", 0)),
+        "summary": body.get("summary", existing.get("summary", {})),
+        "formatting": body.get("formatting", existing.get("formatting", {})),
+        "duration_seconds": existing.get("duration_seconds", 0),
+        "edited": True,
+    }
+
+    await update_session_db(session_id, grid_editor_result=result, current_step=11)
+
+    logger.info("save-grid session %s: %d zones saved", session_id, len(zones))
+
+    return {"session_id": session_id, "saved": True}
+
+
+@router.get("/sessions/{session_id}/grid-editor")
+async def get_grid(session_id: str):
+    """Return the grid editor result stored for a session.
+
+    Responds with 404 when the session does not exist, and also when the
+    session has no (or an empty) grid_editor_result yet.
+    """
+    session = await get_session_db(session_id)
+    if not session:
+        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
+
+    stored_grid = session.get("grid_editor_result")
+    if stored_grid:
+        return stored_grid
+
+    # Nothing has been built or saved for this session so far
+    raise HTTPException(
+        status_code=404,
+        detail="No grid editor data. Run build-grid first.",
+    )