Add gutter repair step to OCR Kombi pipeline
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 41s
CI / test-go-edu-search (push) Successful in 36s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 28s
CI / test-nodejs-website (push) Successful in 29s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 41s
CI / test-go-edu-search (push) Successful in 36s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 28s
CI / test-nodejs-website (push) Successful in 29s
New step "Wortkorrektur" between Grid-Review and Ground Truth that detects and fixes words truncated or blurred at the book gutter (binding area) of double-page scans. Uses pyspellchecker (DE+EN) for validation.

Two repair strategies:
- hyphen_join: words split across rows with missing chars (ve + künden → verkünden)
- spell_fix: garbled trailing chars from gutter blur (stammeli → stammeln)

Interactive frontend with per-suggestion accept/reject and batch controls.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1851,3 +1851,90 @@ async def get_grid(session_id: str):
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Gutter Repair endpoints
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@router.post("/sessions/{session_id}/gutter-repair")
async def gutter_repair(session_id: str):
    """Run the gutter-repair analysis on a session's grid and return suggestions.

    The analysis result is stored under ``ground_truth["gutter_repair"]`` of
    the session and returned unchanged to the caller.

    Suggestion kinds (as named by the analysis step):
    - spell_fix: words truncated/blurred at the book binding
    - hyphen_join: words split across rows with missing hyphen chars

    Raises:
        HTTPException: 404 when the session does not exist, 400 when the
            session has no ``grid_editor_result`` yet.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    grid = session.get("grid_editor_result")
    if not grid:
        raise HTTPException(
            status_code=400,
            detail="No grid data. Run build-grid first.",
        )

    # Imported lazily, only once the request has passed validation.
    from cv_gutter_repair import analyse_grid_for_gutter_repair

    analysis = analyse_grid_for_gutter_repair(
        grid,
        image_width=grid.get("image_width", 0),
    )

    # Suggestions live inside ground_truth so no DB migration is needed.
    ground_truth = session.get("ground_truth") or {}
    ground_truth.update(gutter_repair=analysis)
    await update_session_db(session_id, ground_truth=ground_truth)

    found = analysis.get("stats", {}).get("suggestions_found", 0)
    elapsed = analysis.get("duration_seconds", 0)
    logger.info(
        "gutter-repair session %s: %d suggestions in %.2fs",
        session_id,
        found,
        elapsed,
    )

    return analysis
|
||||
|
||||
|
||||
@router.post("/sessions/{session_id}/gutter-repair/apply")
async def gutter_repair_apply(session_id: str, request: Request):
    """Apply accepted gutter repair suggestions to the grid.

    Body: { "accepted": ["suggestion_id_1", "suggestion_id_2", ...] }

    The suggestions are read from ``ground_truth["gutter_repair"]`` of the
    session; the grid is written back to ``grid_editor_result`` afterwards.

    Returns:
        The result of ``apply_gutter_suggestions``, or
        ``{"applied_count": 0, "changes": []}`` when ``accepted`` is missing
        or empty.

    Raises:
        HTTPException: 404 when the session does not exist; 400 when the
            session has no grid data, no stored gutter-repair result, or the
            request body is not a JSON object with a list under ``accepted``.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    grid_data = session.get("grid_editor_result")
    if not grid_data:
        raise HTTPException(status_code=400, detail="No grid data.")

    gt = session.get("ground_truth") or {}
    gutter_result = gt.get("gutter_repair")
    if not gutter_result:
        raise HTTPException(
            status_code=400,
            detail="No gutter repair data. Run gutter-repair first.",
        )

    # The body is client-supplied: report malformed input as a 400 instead of
    # letting a decode error or AttributeError surface as a 500.
    try:
        body = await request.json()
    except ValueError as exc:  # json.JSONDecodeError / UnicodeDecodeError
        raise HTTPException(
            status_code=400,
            detail="Request body must be valid JSON.",
        ) from exc
    if not isinstance(body, dict):
        raise HTTPException(
            status_code=400,
            detail="Request body must be a JSON object.",
        )

    accepted_ids = body.get("accepted", [])
    if not accepted_ids:
        return {"applied_count": 0, "changes": []}
    if not isinstance(accepted_ids, list):
        raise HTTPException(
            status_code=400,
            detail="'accepted' must be a list of suggestion ids.",
        )

    # Imported lazily, only once the request has passed validation.
    from cv_gutter_repair import apply_gutter_suggestions

    suggestions = gutter_result.get("suggestions", [])
    result = apply_gutter_suggestions(grid_data, accepted_ids, suggestions)

    # Save updated grid back to session
    await update_session_db(session_id, grid_editor_result=grid_data)

    logger.info(
        "gutter-repair/apply session %s: %d changes applied",
        session_id,
        result.get("applied_count", 0),
    )

    return result
|
||||
|
||||
Reference in New Issue
Block a user