feat(ocr-pipeline): add LLM-based OCR correction step (Step 6)

Replace the placeholder "Koordinaten" step with an LLM review step that
sends vocab entries to qwen3:30b-a3b via Ollama for OCR error correction
(e.g. "8en" → "Ben"). Teachers can review, accept/reject individual
corrections in a diff table before applying them.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-02 11:13:17 +01:00
parent e9f368d3ec
commit 938d1d69cf
5 changed files with 586 additions and 5 deletions

View File

@@ -4304,3 +4304,119 @@ async def run_cv_pipeline(
result.duration_seconds = round(time.time() - total_start, 2)
return result
# ---------------------------------------------------------------------------
# LLM-based OCR Correction (Step 6)
# ---------------------------------------------------------------------------
import json as _json
import os
import re as _re
from typing import Optional

import httpx
_OLLAMA_URL = os.getenv("OLLAMA_URL", os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434"))
OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:30b-a3b")
async def llm_review_entries(
def _parse_llm_json_array(text: str) -> List[Dict]:
"""Extract JSON array from LLM response (may contain markdown fences)."""
# Strip markdown code fences
text = _re.sub(r'```json\s*', '', text)
text = _re.sub(r'```\s*', '', text)
# Find array
match = _re.search(r'\[.*\]', text, _re.DOTALL)
if match:
try:
return _json.loads(match.group())
except (ValueError, _json.JSONDecodeError):
pass
return []