feat(ocr-pipeline): add LLM-based OCR correction step (Step 6)

Replace the placeholder "Koordinaten" step with an LLM review step that
sends vocab entries to qwen3:30b-a3b via Ollama for OCR error correction
(e.g. "8en" → "Ben"). Teachers can review, accept/reject individual
corrections in a diff table before applying them.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Benjamin Admin
2026-03-02 11:13:17 +01:00
parent e9f368d3ec
commit 938d1d69cf
5 changed files with 586 additions and 5 deletions


@@ -7,7 +7,7 @@ Splits the OCR process into 8 individual steps:
3. Column detection - find invisible columns
4. Row detection - horizontal rows + header/footer lines
5. Word recognition - OCR with bounding boxes
-6. Coordinate assignment - exact positions
+6. LLM correction - fix OCR errors via LLM
7. Page reconstruction - rebuild the page
8. Ground truth validation - overall check
@@ -30,6 +30,7 @@ from fastapi.responses import Response, StreamingResponse
from pydantic import BaseModel
from cv_vocab_pipeline import (
+    OLLAMA_REVIEW_MODEL,
    PageRegion,
    RowGeometry,
    _cells_to_vocab_entries,
@@ -49,6 +50,7 @@ from cv_vocab_pipeline import (
    detect_row_geometry,
    dewarp_image,
    dewarp_image_manual,
+    llm_review_entries,
    render_image_high_res,
    render_pdf_high_res,
)
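
llm_review_entries and OLLAMA_REVIEW_MODEL are imported from cv_vocab_pipeline;
their implementation is not part of this hunk. For orientation, a minimal
sketch of what such a helper could look like against Ollama's /api/chat
endpoint. The prompt wording, URL, timeout, and response parsing are the
editor's assumptions; only the call signature and the returned keys
(changes, model_used, duration_ms, entries_corrected) are fixed, since the
endpoint code below consumes them:

import json
import time

import httpx

OLLAMA_CHAT_URL = "http://localhost:11434/api/chat"  # assumption: default Ollama host/port

async def llm_review_entries(entries: list[dict], model: str) -> dict:
    """Hypothetical sketch: ask the LLM to fix OCR errors in vocab entries."""
    prompt = (
        "Fix obvious OCR errors in these vocabulary entries. Reply with a "
        'JSON list of objects like {"row_index": 0, "field": "english", "new": "Ben"}.\n'
        + json.dumps(entries, ensure_ascii=False)
    )
    start = time.monotonic()
    async with httpx.AsyncClient(timeout=300.0) as client:
        resp = await client.post(OLLAMA_CHAT_URL, json={
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "stream": False,  # one complete JSON response instead of chunks
        })
        resp.raise_for_status()
        content = resp.json()["message"]["content"]
    # A real implementation needs more robust parsing than a bare json.loads()
    # (models like qwen3 may wrap the answer in extra text or thinking tags).
    changes = json.loads(content)
    return {
        "changes": changes,
        "model_used": model,
        "duration_ms": int((time.monotonic() - start) * 1000),
        "entries_corrected": len({c["row_index"] for c in changes}),
    }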
@@ -1387,6 +1389,124 @@ async def get_word_ground_truth(session_id: str):
    }


# ---------------------------------------------------------------------------
# LLM Review Endpoints (Step 6)
# ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/llm-review")
async def run_llm_review(session_id: str, request: Request):
    """Run LLM-based correction on vocab entries from Step 5."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    word_result = session.get("word_result")
    if not word_result:
        raise HTTPException(status_code=400, detail="No word result found — run Step 5 first")
    entries = word_result.get("vocab_entries") or word_result.get("entries") or []
    if not entries:
        raise HTTPException(status_code=400, detail="No vocab entries found — run Step 5 first")
    # Optional model override from request body
    body = {}
    try:
        body = await request.json()
    except Exception:
        pass
    model = body.get("model") or OLLAMA_REVIEW_MODEL
    try:
        result = await llm_review_entries(entries, model=model)
    except Exception as e:
        logger.error(f"LLM review failed for session {session_id}: {e}")
        raise HTTPException(status_code=502, detail=f"LLM review failed: {e}")
    # Store result inside word_result as a sub-key
    word_result["llm_review"] = {
        "changes": result["changes"],
        "model_used": result["model_used"],
        "duration_ms": result["duration_ms"],
        "entries_corrected": result["entries_corrected"],
    }
    await update_session_db(session_id, word_result=word_result, current_step=6)
    if session_id in _cache:
        _cache[session_id]["word_result"] = word_result
    logger.info(f"LLM review session {session_id}: {len(result['changes'])} changes, "
                f"{result['duration_ms']}ms, model={result['model_used']}")
    return {
        "session_id": session_id,
        "changes": result["changes"],
        "model_used": result["model_used"],
        "duration_ms": result["duration_ms"],
        "total_entries": len(entries),
        "corrections_found": len(result["changes"]),
    }
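
# Example (editor's illustration, not part of this commit): calling the
# endpoint above from a client. Host, port, and session id are assumptions;
# the response keys match the return dict above.
#
#   import requests
#   resp = requests.post(
#       "http://localhost:8000/sessions/abc123/llm-review",
#       json={"model": "qwen3:30b-a3b"},  # optional; falls back to OLLAMA_REVIEW_MODEL
#   )
#   review = resp.json()
#   # Each item in review["changes"] carries at least "row_index", "field",
#   # and "new": the keys the /apply endpoint below reads.
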
@router.post("/sessions/{session_id}/llm-review/apply")
async def apply_llm_corrections(session_id: str, request: Request):
    """Apply selected LLM corrections to vocab entries."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    word_result = session.get("word_result")
    if not word_result:
        raise HTTPException(status_code=400, detail="No word result found")
    llm_review = word_result.get("llm_review")
    if not llm_review:
        raise HTTPException(status_code=400, detail="No LLM review found — run /llm-review first")
    body = await request.json()
    accepted_indices = set(body.get("accepted_indices", []))  # indices into changes[]
    changes = llm_review.get("changes", [])
    entries = word_result.get("vocab_entries") or word_result.get("entries") or []
    # Build a lookup: (row_index, field) -> new_value for accepted changes
    corrections = {}
    applied_count = 0
    for idx, change in enumerate(changes):
        if idx in accepted_indices:
            key = (change["row_index"], change["field"])
            corrections[key] = change["new"]
            applied_count += 1
    # Apply corrections to entries
    for entry in entries:
        row_idx = entry.get("row_index", -1)
        for field_name in ("english", "german", "example"):
            key = (row_idx, field_name)
            if key in corrections:
                entry[field_name] = corrections[key]
                entry["llm_corrected"] = True
    # Update word_result
    word_result["vocab_entries"] = entries
    word_result["entries"] = entries
    word_result["llm_review"]["applied_count"] = applied_count
    word_result["llm_review"]["applied_at"] = datetime.utcnow().isoformat()
    await update_session_db(session_id, word_result=word_result)
    if session_id in _cache:
        _cache[session_id]["word_result"] = word_result
    logger.info(f"Applied {applied_count}/{len(changes)} LLM corrections for session {session_id}")
    return {
        "session_id": session_id,
        "applied_count": applied_count,
        "total_changes": len(changes),
    }
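
# Example (editor's illustration, not part of this commit): accepting a subset
# of the suggested changes. accepted_indices are positions in the changes list
# returned by /llm-review; the concrete values here are made up.
#
#   resp = requests.post(
#       "http://localhost:8000/sessions/abc123/llm-review/apply",
#       json={"accepted_indices": [0, 2]},
#   )
#   resp.json()  # -> {"session_id": "abc123", "applied_count": 2, "total_changes": 3}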

async def _get_rows_overlay(session_id: str) -> Response:
    """Generate dewarped image with row bands drawn on it."""
    session = await get_session_db(session_id)