[split-required] Split 500-850 LOC files (batch 2)

backend-lehrer (10 files):
- game/database.py (785 → 5), correction_api.py (683 → 4)
- classroom_engine/antizipation.py (676 → 5)
- llm_gateway schools/edu_search already done in prior batch

klausur-service (12 files):
- orientation_crop_api.py (694 → 5), pdf_export.py (677 → 4)
- zeugnis_crawler.py (676 → 5), grid_editor_api.py (671 → 5)
- eh_templates.py (658 → 5), mail/api.py (651 → 5)
- qdrant_service.py (638 → 5), training_api.py (625 → 4)

website (6 pages):
- middleware (696 → 8), mail (733 → 6), consent (628 → 8)
- compliance/risks (622 → 5), export (502 → 5), brandbook (629 → 7)

studio-v2 (3 components):
- B2BMigrationWizard (848 → 3), CleanupPanel (765 → 2)
- dashboard-experimental (739 → 2)

admin-lehrer (4 files):
- uebersetzungen (769 → 4), manager (670 → 2)
- ChunkBrowserQA (675 → 6), dsfa/page (674 → 5)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 08:24:01 +02:00
parent 34da9f4cda
commit b4613e26f3
118 changed files with 15258 additions and 14680 deletions
--- a/klausur-service/backend/grid_editor_api_box.py
+++ b/klausur-service/backend/grid_editor_api_box.py
@@ -0,0 +1,177 @@
+"""
+Grid Editor API — box-grid-review endpoints.
+"""
+
+import logging
+
+from fastapi import APIRouter, HTTPException, Request
+
+from grid_editor_helpers import _words_in_zone
+from ocr_pipeline_session_store import (
+    get_session_db,
+    update_session_db,
+)
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+# Router for the endpoints declared in this module: every route registered on
+# it is served under the /api/v1/ocr-pipeline prefix and tagged "grid-editor".
+router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
+
+
+def _load_spell_checker():
+    """Return a SmartSpellChecker instance, or None if it cannot be imported.
+
+    The spell checker is treated as an optional dependency: an ImportError
+    raised while importing or constructing it disables spell correction
+    instead of failing the request.
+    """
+    try:
+        from smart_spell import SmartSpellChecker
+        return SmartSpellChecker()
+    except ImportError:
+        logger.debug("build-box-grids: smart_spell not importable, skipping spell correction")
+        return None
+
+
+def _spell_fix_cells(checker, cells):
+    """Spell-correct the "text" of each cell in place; return the number of changed cells.
+
+    Cells with empty or missing text are skipped. An ImportError raised while
+    correcting stops the pass for these cells; corrections already applied
+    are kept and counted.
+    """
+    fixes = 0
+    try:
+        for cell in cells:
+            text = cell.get("text", "")
+            if not text:
+                continue
+            result = checker.correct_text(text, lang="auto")
+            if result.changed:
+                cell["text"] = result.corrected
+                fixes += 1
+    except ImportError:
+        # Best-effort step: keep whatever was corrected so far.
+        pass
+    return fixes
+
+
+@router.post("/sessions/{session_id}/build-box-grids")
+async def build_box_grids(session_id: str, request: Request):
+    """Rebuild grid structure for all detected boxes with layout-aware detection.
+
+    Uses structure_result.boxes (from Step 7) as the source of box coordinates,
+    and raw_paddle_words (falling back to raw_tesseract_words) as OCR word
+    source. Existing zones with zone_type "box" are dropped from
+    grid_editor_result and rebuilt; all other zones are kept. The updated
+    grid_editor_result is written back to the session.
+
+    Optional body: { "overrides": { "0": "bullet_list" } }
+    Maps box_index -> forced layout_type. box_index is the position of the box
+    in the detected list after header/footer boxes have been filtered out.
+    A missing, unparsable or non-object body (or a non-object "overrides"
+    value) is treated as "no overrides".
+
+    Returns:
+        A dict with session_id, box_zones_rebuilt, total_detected_boxes,
+        spell_fixes and the full, y-sorted zones list. If no boxes were
+        detected at all, a short dict with a "message" is returned instead
+        and nothing is persisted.
+
+    Raises:
+        HTTPException: 404 if the session does not exist; 400 if the session
+            has no grid data or no raw OCR words.
+    """
+    session = await get_session_db(session_id)
+    if not session:
+        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
+
+    grid_data = session.get("grid_editor_result")
+    if not grid_data:
+        raise HTTPException(status_code=400, detail="No grid data. Run build-grid first.")
+
+    # Get raw OCR words (with top/left/width/height keys)
+    word_result = session.get("word_result") or {}
+    all_words = word_result.get("raw_paddle_words") or word_result.get("raw_tesseract_words") or []
+    if not all_words:
+        raise HTTPException(status_code=400, detail="No raw OCR words available.")
+
+    # Get detected boxes from structure_result, falling back to the copy stored
+    # under ground_truth when the session has no structure_result of its own.
+    structure_result = session.get("structure_result") or {}
+    if not structure_result:
+        structure_result = (session.get("ground_truth") or {}).get("structure_result") or {}
+    detected_boxes = structure_result.get("boxes") or []
+    if not detected_boxes:
+        return {"session_id": session_id, "box_zones_rebuilt": 0, "spell_fixes": 0, "message": "No boxes detected"}
+
+    # Image size: prefer the grid data, fall back to the OCR result.
+    # 0 means "unknown" (a stored None is normalised to 0 as well).
+    img_w = grid_data.get("image_width", 0) or word_result.get("image_width", 0) or 0
+    img_h = grid_data.get("image_height", 0) or word_result.get("image_height", 0) or 0
+
+    # Filter out false-positive boxes in header/footer margins.
+    # Only possible when the image height is known.
+    if img_h > 0:
+        margin_frac = 0.07  # 7% of image height
+        margin_top = img_h * margin_frac
+        margin_bottom = img_h * (1 - margin_frac)
+        kept_boxes = []
+        for box in detected_boxes:
+            by = box.get("y", 0)
+            bh = box.get("h", 0)
+            box_center_y = by + bh / 2
+            if box_center_y < margin_top or box_center_y > margin_bottom:
+                logger.info("build-box-grids: skipping header/footer box at y=%d h=%d (center=%.0f, margins=%.0f/%.0f)",
+                            by, bh, box_center_y, margin_top, margin_bottom)
+                continue
+            kept_boxes.append(box)
+        detected_boxes = kept_boxes
+
+    # The request body is optional. Anything that is not a JSON object with an
+    # object-valued "overrides" entry means "no overrides" rather than an error.
+    try:
+        body = await request.json()
+    except Exception:
+        body = None
+    layout_overrides = body.get("overrides") if isinstance(body, dict) else None
+    if not isinstance(layout_overrides, dict):
+        layout_overrides = {}
+
+    from cv_box_layout import build_box_zone_grid
+
+    zones = grid_data.get("zones", [])
+
+    # Find highest existing zone_index.
+    # NOTE(review): this is taken BEFORE the old box zones are removed, so the
+    # rebuilt box zones never reuse an index of a zone (box or not) that was
+    # present before this call. Confirm that growing indices on repeated
+    # rebuilds is intended.
+    max_zone_idx = max((z.get("zone_index", 0) for z in zones), default=-1)
+
+    # Remove old box zones (we'll rebuild them)
+    zones = [z for z in zones if z.get("zone_type") != "box"]
+
+    box_count = 0
+    spell_fixes = 0
+
+    # The spell checker is created at most once, on the first box that has
+    # words, and then shared by all remaining boxes.
+    spell_checker = None
+    spell_checker_loaded = False
+
+    for box_idx, box in enumerate(detected_boxes):
+        bx = box.get("x", 0)
+        by = box.get("y", 0)
+        bw = box.get("w", 0)
+        bh = box.get("h", 0)
+
+        if bw <= 0 or bh <= 0:
+            continue
+
+        # Filter raw OCR words inside this box
+        zone_words = _words_in_zone(all_words, by, bh, bx, bw)
+        if not zone_words:
+            logger.info("Box %d: no words found in bbox (%d,%d,%d,%d)", box_idx, bx, by, bw, bh)
+            continue
+
+        # Skipped boxes still consume their index, so zone_index stays tied to
+        # the box's position in the detected list.
+        zone_idx = max_zone_idx + 1 + box_idx
+        forced_layout = layout_overrides.get(str(box_idx))
+
+        # Build box grid
+        box_grid = build_box_zone_grid(
+            zone_words, bx, by, bw, bh,
+            zone_idx, img_w, img_h,
+            layout_type=forced_layout,
+        )
+
+        # Apply SmartSpellChecker to all box cells (skipped when unavailable)
+        if not spell_checker_loaded:
+            spell_checker = _load_spell_checker()
+            spell_checker_loaded = True
+        if spell_checker is not None:
+            spell_fixes += _spell_fix_cells(spell_checker, box_grid.get("cells", []))
+
+        # Build zone entry
+        zone_entry = {
+            "zone_index": zone_idx,
+            "zone_type": "box",
+            "bbox_px": {"x": bx, "y": by, "w": bw, "h": bh},
+            "bbox_pct": {
+                "x": round(bx / img_w * 100, 2) if img_w else 0,
+                "y": round(by / img_h * 100, 2) if img_h else 0,
+                "w": round(bw / img_w * 100, 2) if img_w else 0,
+                "h": round(bh / img_h * 100, 2) if img_h else 0,
+            },
+            "border": None,
+            "word_count": len(zone_words),
+            "columns": box_grid["columns"],
+            "rows": box_grid["rows"],
+            "cells": box_grid["cells"],
+            "header_rows": box_grid.get("header_rows", []),
+            "box_layout_type": box_grid.get("box_layout_type", "flowing"),
+            "box_grid_reviewed": False,
+            "box_bg_color": box.get("bg_color_name", ""),
+            "box_bg_hex": box.get("bg_color_hex", ""),
+        }
+        zones.append(zone_entry)
+        box_count += 1
+
+    # Sort zones by y-position for correct reading order.
+    # Zones without a usable bbox_px / y sort as y=0.
+    zones.sort(key=lambda z: (z.get("bbox_px") or {}).get("y", 0) or 0)
+
+    grid_data["zones"] = zones
+    await update_session_db(session_id, grid_editor_result=grid_data)
+
+    logger.info(
+        "build-box-grids session %s: %d boxes processed (%d words spell-fixed) from %d detected",
+        session_id, box_count, spell_fixes, len(detected_boxes),
+    )
+
+    return {
+        "session_id": session_id,
+        "box_zones_rebuilt": box_count,
+        "total_detected_boxes": len(detected_boxes),
+        "spell_fixes": spell_fixes,
+        "zones": zones,
+    }