feat(ocr-pipeline): add LLM-based OCR correction step (Step 6)
Replace the placeholder "Koordinaten" step with an LLM review step that sends vocab entries to qwen3:30b-a3b via Ollama for OCR error correction (e.g. "8en" → "Ben"). Teachers can review, accept/reject individual corrections in a diff table before applying them. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4304,3 +4304,119 @@ async def run_cv_pipeline(
|
||||
result.duration_seconds = round(time.time() - total_start, 2)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# LLM-based OCR Correction (Step 6)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
import httpx
|
||||
import os
|
||||
import json as _json
|
||||
import re as _re
|
||||
|
||||
_OLLAMA_URL = os.getenv("OLLAMA_URL", os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434"))
|
||||
OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:30b-a3b")
|
||||
|
||||
|
||||
async def llm_review_entries(
    entries: List[Dict],
    model: str = None,
) -> Dict:
    """Send vocab entries to a local LLM (Ollama) for OCR error correction.

    Builds a compact JSON table from *entries*, asks the model to fix obvious
    OCR mistakes (e.g. "8en" -> "Ben"), then diffs the LLM output against the
    originals field by field so a teacher can review each change.

    Args:
        entries: Vocab entries with "row_index", "english", "german" and
            "example" keys; any extra keys are carried through untouched.
        model: Ollama model name; defaults to OLLAMA_REVIEW_MODEL.

    Returns:
        Dict with keys "entries_original", "entries_corrected", "changes"
        (list of {row_index, field, old, new}), "model_used", "duration_ms".

    Raises:
        httpx.HTTPStatusError: if the Ollama API responds with an error status.
    """
    model = model or OLLAMA_REVIEW_MODEL

    # Compact row representation keeps the prompt (and token count) small.
    table_lines = [
        {
            "row": e.get("row_index", 0),
            "en": e.get("english", ""),
            "de": e.get("german", ""),
            "ex": e.get("example", ""),
        }
        for e in entries
    ]

    prompt = f"""Du bist ein Korrekturleser fuer OCR-erkannte Vokabeltabellen (Englisch-Deutsch).
Die Tabelle wurde per OCR aus einem Schulbuch-Scan extrahiert. Korrigiere NUR offensichtliche OCR-Fehler.

Haeufige OCR-Fehler die du korrigieren sollst:
- Ziffern statt Buchstaben: 8→B, 0→O, 1→l/I, 5→S, 6→G
- Fehlende oder falsche Satzzeichen
- Offensichtliche Tippfehler die durch OCR entstanden sind

WICHTIG:
- Aendere NICHTS was korrekt aussieht
- Erfinde KEINE neuen Woerter oder Uebersetzungen
- Behalte Abkuerzungen wie sth., sb., etc. bei
- Behalte die exakte Struktur (gleiche Anzahl Eintraege)

Antworte NUR mit dem korrigierten JSON-Array. Kein erklaerender Text.
Fuer jeden Eintrag den du aenderst, setze "corrected": true.
Fuer unveraenderte Eintraege setze "corrected": false.

Eingabe:
{_json.dumps(table_lines, ensure_ascii=False, indent=2)}"""

    t0 = time.time()
    async with httpx.AsyncClient(timeout=120.0) as client:
        resp = await client.post(
            f"{_OLLAMA_URL}/api/chat",
            json={
                "model": model,
                "messages": [{"role": "user", "content": prompt}],
                "stream": False,
                # Low temperature: we want conservative, reproducible fixes,
                # not creative rewrites.
                "options": {"temperature": 0.1, "num_predict": 8192},
            },
        )
        resp.raise_for_status()
        content = resp.json().get("message", {}).get("content", "")
        duration_ms = int((time.time() - t0) * 1000)

    # Parse LLM response — extract the JSON array (may be fenced in markdown).
    corrected = _parse_llm_json_array(content)

    # Diff original vs corrected, matched positionally: the prompt demands
    # the LLM keep the exact structure, so row i maps to entry i.
    changes = []
    entries_corrected = []
    for i, orig in enumerate(entries):
        if i >= len(corrected):
            # LLM returned fewer rows than we sent — keep the tail untouched.
            entries_corrected.append(dict(orig))
            continue
        c = corrected[i]
        entry = dict(orig)
        for field_name, key in [("english", "en"), ("german", "de"), ("example", "ex")]:
            # `or ""` guards against the LLM emitting null for a field;
            # dict.get's default does not apply when the key maps to None.
            new_val = str(c.get(key) or "").strip()
            old_val = (orig.get(field_name, "") or "").strip()
            if new_val and new_val != old_val:
                changes.append({
                    "row_index": orig.get("row_index", i),
                    "field": field_name,
                    "old": old_val,
                    "new": new_val,
                })
                entry[field_name] = new_val
                entry["llm_corrected"] = True
        entries_corrected.append(entry)

    return {
        "entries_original": entries,
        "entries_corrected": entries_corrected,
        "changes": changes,
        "model_used": model,
        "duration_ms": duration_ms,
    }
|
||||
|
||||
|
||||
def _parse_llm_json_array(text: str) -> List[Dict]:
|
||||
"""Extract JSON array from LLM response (may contain markdown fences)."""
|
||||
# Strip markdown code fences
|
||||
text = _re.sub(r'```json\s*', '', text)
|
||||
text = _re.sub(r'```\s*', '', text)
|
||||
# Find array
|
||||
match = _re.search(r'\[.*\]', text, _re.DOTALL)
|
||||
if match:
|
||||
try:
|
||||
return _json.loads(match.group())
|
||||
except (ValueError, _json.JSONDecodeError):
|
||||
pass
|
||||
return []
|
||||
|
||||
@@ -7,7 +7,7 @@ Zerlegt den OCR-Prozess in 8 einzelne Schritte:
|
||||
3. Spaltenerkennung - Unsichtbare Spalten finden
|
||||
4. Zeilenerkennung - Horizontale Zeilen + Kopf-/Fusszeilen
|
||||
5. Worterkennung - OCR mit Bounding Boxes
|
||||
6. Koordinatenzuweisung - Exakte Positionen
|
||||
6. LLM-Korrektur - OCR-Fehler per LLM korrigieren
|
||||
7. Seitenrekonstruktion - Seite nachbauen
|
||||
8. Ground Truth Validierung - Gesamtpruefung
|
||||
|
||||
@@ -30,6 +30,7 @@ from fastapi.responses import Response, StreamingResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
from cv_vocab_pipeline import (
|
||||
OLLAMA_REVIEW_MODEL,
|
||||
PageRegion,
|
||||
RowGeometry,
|
||||
_cells_to_vocab_entries,
|
||||
@@ -49,6 +50,7 @@ from cv_vocab_pipeline import (
|
||||
detect_row_geometry,
|
||||
dewarp_image,
|
||||
dewarp_image_manual,
|
||||
llm_review_entries,
|
||||
render_image_high_res,
|
||||
render_pdf_high_res,
|
||||
)
|
||||
@@ -1387,6 +1389,124 @@ async def get_word_ground_truth(session_id: str):
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# LLM Review Endpoints (Step 6)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@router.post("/sessions/{session_id}/llm-review")
async def run_llm_review(session_id: str, request: Request):
    """Run LLM-based OCR correction on the vocab entries from Step 5.

    Returns the proposed corrections as a diff ("changes") without applying
    them; the teacher reviews them and POSTs the accepted subset to
    /llm-review/apply.

    Raises:
        HTTPException 404: unknown session.
        HTTPException 400: Step 5 has not produced vocab entries yet.
        HTTPException 502: the LLM backend failed.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    word_result = session.get("word_result")
    if not word_result:
        raise HTTPException(status_code=400, detail="No word result found — run Step 5 first")

    entries = word_result.get("vocab_entries") or word_result.get("entries") or []
    if not entries:
        raise HTTPException(status_code=400, detail="No vocab entries found — run Step 5 first")

    # Optional model override from the request body; the body itself is
    # optional, so an empty/non-JSON body is simply ignored.
    body = {}
    try:
        body = await request.json()
    except Exception:
        pass
    if not isinstance(body, dict):
        # A JSON array/scalar body would crash .get() below — treat as empty.
        body = {}
    model = body.get("model") or OLLAMA_REVIEW_MODEL

    try:
        result = await llm_review_entries(entries, model=model)
    except Exception as e:
        logger.error(f"LLM review failed for session {session_id}: {e}")
        raise HTTPException(status_code=502, detail=f"LLM review failed: {e}") from e

    # Store result inside word_result as a sub-key so /llm-review/apply can
    # find the pending diff later.
    word_result["llm_review"] = {
        "changes": result["changes"],
        "model_used": result["model_used"],
        "duration_ms": result["duration_ms"],
        "entries_corrected": result["entries_corrected"],
    }
    await update_session_db(session_id, word_result=word_result, current_step=6)

    # Keep the in-memory cache consistent with the DB.
    if session_id in _cache:
        _cache[session_id]["word_result"] = word_result

    logger.info(f"LLM review session {session_id}: {len(result['changes'])} changes, "
                f"{result['duration_ms']}ms, model={result['model_used']}")

    return {
        "session_id": session_id,
        "changes": result["changes"],
        "model_used": result["model_used"],
        "duration_ms": result["duration_ms"],
        "total_entries": len(entries),
        "corrections_found": len(result["changes"]),
    }
|
||||
|
||||
|
||||
@router.post("/sessions/{session_id}/llm-review/apply")
async def apply_llm_corrections(session_id: str, request: Request):
    """Apply a teacher-selected subset of LLM corrections to vocab entries.

    Expects a JSON body {"accepted_indices": [int, ...]} where each index
    points into the "changes" list produced by /llm-review.

    Raises:
        HTTPException 404: unknown session.
        HTTPException 400: missing word result / LLM review, or invalid body.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    word_result = session.get("word_result")
    if not word_result:
        raise HTTPException(status_code=400, detail="No word result found")

    llm_review = word_result.get("llm_review")
    if not llm_review:
        raise HTTPException(status_code=400, detail="No LLM review found — run /llm-review first")

    # Unlike /llm-review, the body here is required: it carries the selection.
    try:
        body = await request.json()
    except Exception:
        raise HTTPException(status_code=400, detail="Request body must be valid JSON")
    if not isinstance(body, dict):
        raise HTTPException(status_code=400, detail="Request body must be a JSON object")
    accepted_indices = set(body.get("accepted_indices", []))  # indices into changes[]

    changes = llm_review.get("changes", [])
    entries = word_result.get("vocab_entries") or word_result.get("entries") or []

    # Build a lookup: (row_index, field) -> new_value for accepted changes.
    corrections = {}
    applied_count = 0
    for idx, change in enumerate(changes):
        if idx in accepted_indices:
            key = (change["row_index"], change["field"])
            corrections[key] = change["new"]
            applied_count += 1

    # Apply the accepted corrections in place.
    for entry in entries:
        row_idx = entry.get("row_index", -1)
        for field_name in ("english", "german", "example"):
            key = (row_idx, field_name)
            if key in corrections:
                entry[field_name] = corrections[key]
                entry["llm_corrected"] = True

    # Update word_result under both keys (some readers use "vocab_entries",
    # others "entries" — keep them in sync).
    word_result["vocab_entries"] = entries
    word_result["entries"] = entries
    word_result["llm_review"]["applied_count"] = applied_count
    # NOTE(review): utcnow() is naive (no tzinfo) and deprecated in 3.12 —
    # consider datetime.now(timezone.utc); kept as-is to preserve the stored
    # timestamp format.
    word_result["llm_review"]["applied_at"] = datetime.utcnow().isoformat()

    await update_session_db(session_id, word_result=word_result)

    # Keep the in-memory cache consistent with the DB.
    if session_id in _cache:
        _cache[session_id]["word_result"] = word_result

    logger.info(f"Applied {applied_count}/{len(changes)} LLM corrections for session {session_id}")

    return {
        "session_id": session_id,
        "applied_count": applied_count,
        "total_changes": len(changes),
    }
|
||||
|
||||
|
||||
async def _get_rows_overlay(session_id: str) -> Response:
|
||||
"""Generate dewarped image with row bands drawn on it."""
|
||||
session = await get_session_db(session_id)
|
||||
|
||||
Reference in New Issue
Block a user