feat(ocr-pipeline): add LLM-based OCR correction step (Step 6)

Replace the placeholder "Koordinaten" step with an LLM review step that
sends vocab entries to qwen3:30b-a3b via Ollama for OCR error correction
(e.g. "8en" → "Ben"). Teachers can review, accept/reject individual
corrections in a diff table before applying them.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-02 11:13:17 +01:00
parent e9f368d3ec
commit 938d1d69cf
5 changed files with 586 additions and 5 deletions

View File

@@ -4304,3 +4304,119 @@ async def run_cv_pipeline(
result.duration_seconds = round(time.time() - total_start, 2)
return result
# ---------------------------------------------------------------------------
# LLM-based OCR Correction (Step 6)
# ---------------------------------------------------------------------------
import json as _json
import os
import re as _re
from typing import Optional

import httpx
_OLLAMA_URL = os.getenv("OLLAMA_URL", os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434"))
OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:30b-a3b")
async def llm_review_entries(
def _parse_llm_json_array(text: str) -> List[Dict]:
"""Extract JSON array from LLM response (may contain markdown fences)."""
# Strip markdown code fences
text = _re.sub(r'```json\s*', '', text)
text = _re.sub(r'```\s*', '', text)
# Find array
match = _re.search(r'\[.*\]', text, _re.DOTALL)
if match:
try:
return _json.loads(match.group())
except (ValueError, _json.JSONDecodeError):
pass
return []