Add gutter repair step to OCR Kombi pipeline

New step "Wortkorrektur" between Grid-Review and Ground Truth that detects and fixes words truncated or blurred at the book gutter (binding area) of double-page scans. Uses pyspellchecker (DE+EN) for validation.

Two repair strategies:
- hyphen_join: words split across rows with missing chars (ve + künden → verkünden)
- spell_fix: garbled trailing chars from gutter blur (stammeli → stammeln)

Interactive frontend with per-suggestion accept/reject and batch controls.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-10 18:50:16 +02:00
parent 21b69e06be
commit 71e1b10ac7
7 changed files with 1376 additions and 3 deletions
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -1851,3 +1851,90 @@ async def get_grid(session_id: str):
        )

    return result
+
+
+# ---------------------------------------------------------------------------
+# Gutter Repair endpoints
+# ---------------------------------------------------------------------------
+
+@router.post("/sessions/{session_id}/gutter-repair")
+async def gutter_repair(session_id: str):
+    """Analyse grid for gutter-edge OCR errors and return repair suggestions.
+
+    Detects:
+      - Words truncated/blurred at the book binding (spell_fix)
+      - Words split across rows with missing hyphen chars (hyphen_join)
+    """
+    session = await get_session_db(session_id)
+    if not session:
+        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
+
+    grid_data = session.get("grid_editor_result")
+    if not grid_data:
+        raise HTTPException(
+            status_code=400,
+            detail="No grid data. Run build-grid first.",
+        )
+
+    from cv_gutter_repair import analyse_grid_for_gutter_repair
+
+    image_width = grid_data.get("image_width", 0)
+    result = analyse_grid_for_gutter_repair(grid_data, image_width=image_width)
+
+    # Persist suggestions in ground_truth.gutter_repair (avoids DB migration)
+    gt = session.get("ground_truth") or {}
+    gt["gutter_repair"] = result
+    await update_session_db(session_id, ground_truth=gt)
+
+    logger.info(
+        "gutter-repair session %s: %d suggestions in %.2fs",
+        session_id,
+        result.get("stats", {}).get("suggestions_found", 0),
+        result.get("duration_seconds", 0),
+    )
+
+    return result
+
+
+@router.post("/sessions/{session_id}/gutter-repair/apply")
+async def gutter_repair_apply(session_id: str, request: Request):
+    """Apply accepted gutter repair suggestions to the grid.
+
+    Body: { "accepted": ["suggestion_id_1", "suggestion_id_2", ...] }
+    """
+    session = await get_session_db(session_id)
+    if not session:
+        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
+
+    grid_data = session.get("grid_editor_result")
+    if not grid_data:
+        raise HTTPException(status_code=400, detail="No grid data.")
+
+    gt = session.get("ground_truth") or {}
+    gutter_result = gt.get("gutter_repair")
+    if not gutter_result:
+        raise HTTPException(
+            status_code=400,
+            detail="No gutter repair data. Run gutter-repair first.",
+        )
+
+    body = await request.json()
+    accepted_ids = body.get("accepted", [])
+    if not accepted_ids:
+        return {"applied_count": 0, "changes": []}
+
+    from cv_gutter_repair import apply_gutter_suggestions
+
+    suggestions = gutter_result.get("suggestions", [])
+    result = apply_gutter_suggestions(grid_data, accepted_ids, suggestions)
+
+    # Save updated grid back to session
+    await update_session_db(session_id, grid_editor_result=grid_data)
+
+    logger.info(
+        "gutter-repair/apply session %s: %d changes applied",
+        session_id,
+        result.get("applied_count", 0),
+    )
+
+    return result