feat(ocr-pipeline): add LLM-based OCR correction step (Step 6)

Replace the placeholder "Koordinaten" step with an LLM review step that
sends vocab entries to qwen3:30b-a3b via Ollama for OCR error correction
(e.g. "8en" → "Ben"). Teachers can review, accept/reject individual
corrections in a diff table before applying them.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-02 11:13:17 +01:00
parent e9f368d3ec
commit 938d1d69cf
5 changed files with 586 additions and 5 deletions

View File

@@ -8,7 +8,7 @@ import { StepDewarp } from '@/components/ocr-pipeline/StepDewarp'
import { StepColumnDetection } from '@/components/ocr-pipeline/StepColumnDetection'
import { StepRowDetection } from '@/components/ocr-pipeline/StepRowDetection'
import { StepWordRecognition } from '@/components/ocr-pipeline/StepWordRecognition'
import { StepCoordinates } from '@/components/ocr-pipeline/StepCoordinates'
import { StepLlmReview } from '@/components/ocr-pipeline/StepLlmReview'
import { StepReconstruction } from '@/components/ocr-pipeline/StepReconstruction'
import { StepGroundTruth } from '@/components/ocr-pipeline/StepGroundTruth'
import { PIPELINE_STEPS, type PipelineStep, type SessionListItem } from './types'
@@ -155,7 +155,7 @@ export default function OcrPipelinePage() {
3: 'Spalten',
4: 'Zeilen',
5: 'Woerter',
6: 'Koordinaten',
6: 'LLM-Korrektur',
7: 'Rekonstruktion',
8: 'Validierung',
}
@@ -173,7 +173,7 @@ export default function OcrPipelinePage() {
case 4:
return <StepWordRecognition sessionId={sessionId} onNext={handleNext} goToStep={goToStep} />
case 5:
return <StepCoordinates />
return <StepLlmReview sessionId={sessionId} onNext={handleNext} />
case 6:
return <StepReconstruction />
case 7:

View File

@@ -208,7 +208,7 @@ export const PIPELINE_STEPS: PipelineStep[] = [
{ id: 'columns', name: 'Spalten', icon: '📊', status: 'pending' },
{ id: 'rows', name: 'Zeilen', icon: '📏', status: 'pending' },
{ id: 'words', name: 'Woerter', icon: '🔤', status: 'pending' },
{ id: 'coordinates', name: 'Koordinaten', icon: '📍', status: 'pending' },
{ id: 'llm-review', name: 'LLM-Korrektur', icon: '🤖', status: 'pending' },
{ id: 'reconstruction', name: 'Rekonstruktion', icon: '🏗️', status: 'pending' },
{ id: 'ground-truth', name: 'Validierung', icon: '✅', status: 'pending' },
]

View File

@@ -0,0 +1,345 @@
'use client'
import { useCallback, useState } from 'react'
// Base path of the backend API (proxied to the Klausur service).
const KLAUSUR_API = '/klausur-api'

/** A single correction the LLM proposes for one cell of the vocab table. */
interface LlmChange {
  /** Row of the vocab table the change applies to. */
  row_index: number
  /** Which column of the entry is being corrected. */
  field: 'english' | 'german' | 'example'
  /** Original OCR text. */
  old: string
  /** Corrected text proposed by the LLM. */
  new: string
}

/** Response payload of POST .../sessions/{id}/llm-review. */
interface LlmReviewResult {
  changes: LlmChange[]
  model_used: string
  duration_ms: number
  total_entries: number
  corrections_found: number
}

/** Props for the Step-6 LLM review component. */
interface StepLlmReviewProps {
  /** Current pipeline session id, or null when no session is selected. */
  sessionId: string | null
  /** Advances the pipeline wizard to the next step. */
  onNext: () => void
}

// Short display labels for the three correctable fields (shown as badges).
const FIELD_LABELS: Record<string, string> = {
  english: 'EN',
  german: 'DE',
  example: 'Beispiel',
}
export function StepLlmReview({ sessionId, onNext }: StepLlmReviewProps) {
const [status, setStatus] = useState<'idle' | 'running' | 'done' | 'error' | 'applied'>('idle')
const [result, setResult] = useState<LlmReviewResult | null>(null)
const [error, setError] = useState<string>('')
const [accepted, setAccepted] = useState<Set<number>>(new Set())
const [applying, setApplying] = useState(false)
const runReview = useCallback(async () => {
if (!sessionId) return
setStatus('running')
setError('')
setResult(null)
try {
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/llm-review`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({}),
})
if (!res.ok) {
const data = await res.json().catch(() => ({}))
throw new Error(data.detail || `HTTP ${res.status}`)
}
const data: LlmReviewResult = await res.json()
setResult(data)
// Accept all changes by default
setAccepted(new Set(data.changes.map((_, i) => i)))
setStatus('done')
} catch (e: unknown) {
const msg = e instanceof Error ? e.message : String(e)
setError(msg)
setStatus('error')
}
}, [sessionId])
const toggleChange = (index: number) => {
setAccepted((prev) => {
const next = new Set(prev)
if (next.has(index)) next.delete(index)
else next.add(index)
return next
})
}
const toggleAll = () => {
if (!result) return
if (accepted.size === result.changes.length) {
setAccepted(new Set())
} else {
setAccepted(new Set(result.changes.map((_, i) => i)))
}
}
const applyChanges = useCallback(async () => {
if (!sessionId || !result) return
setApplying(true)
try {
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/llm-review/apply`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ accepted_indices: Array.from(accepted) }),
})
if (!res.ok) {
const data = await res.json().catch(() => ({}))
throw new Error(data.detail || `HTTP ${res.status}`)
}
setStatus('applied')
} catch (e: unknown) {
const msg = e instanceof Error ? e.message : String(e)
setError(msg)
} finally {
setApplying(false)
}
}, [sessionId, result, accepted])
if (!sessionId) {
return (
<div className="text-center py-12 text-gray-400">
Bitte zuerst eine Session auswaehlen.
</div>
)
}
// --- Idle state ---
if (status === 'idle') {
return (
<div className="flex flex-col items-center justify-center py-12 text-center">
<div className="text-5xl mb-4">🤖</div>
<h3 className="text-lg font-medium text-gray-700 dark:text-gray-300 mb-2">
Schritt 6: LLM-Korrektur
</h3>
<p className="text-gray-500 dark:text-gray-400 max-w-lg mb-2">
Ein lokales Sprachmodell prueft die OCR-Ergebnisse auf typische Erkennungsfehler
(z.B. &quot;8en&quot; statt &quot;Ben&quot;) und schlaegt Korrekturen vor.
</p>
<p className="text-xs text-gray-400 dark:text-gray-500 mb-6">
Modell: <span className="font-mono">qwen3:30b-a3b</span> via Ollama (lokal)
</p>
<button
onClick={runReview}
className="px-6 py-2.5 bg-teal-600 text-white rounded-lg hover:bg-teal-700 transition-colors font-medium"
>
LLM-Korrektur starten
</button>
</div>
)
}
// --- Running state ---
if (status === 'running') {
return (
<div className="flex flex-col items-center justify-center py-16 text-center">
<div className="animate-spin rounded-full h-10 w-10 border-b-2 border-teal-500 mb-4" />
<h3 className="text-lg font-medium text-gray-700 dark:text-gray-300 mb-1">
Korrektur laeuft...
</h3>
<p className="text-sm text-gray-400">
<span className="font-mono">qwen3:30b-a3b</span> prueft die Vokabeleintraege
</p>
</div>
)
}
// --- Error state ---
if (status === 'error') {
return (
<div className="flex flex-col items-center justify-center py-12 text-center">
<div className="text-5xl mb-4"></div>
<h3 className="text-lg font-medium text-red-600 dark:text-red-400 mb-2">
Fehler bei LLM-Korrektur
</h3>
<p className="text-sm text-gray-500 dark:text-gray-400 max-w-lg mb-4">
{error}
</p>
<div className="flex gap-3">
<button
onClick={runReview}
className="px-5 py-2 bg-teal-600 text-white rounded-lg hover:bg-teal-700 transition-colors text-sm"
>
Erneut versuchen
</button>
<button
onClick={onNext}
className="px-5 py-2 bg-gray-200 dark:bg-gray-700 text-gray-700 dark:text-gray-300 rounded-lg hover:bg-gray-300 dark:hover:bg-gray-600 transition-colors text-sm"
>
Ueberspringen
</button>
</div>
</div>
)
}
// --- Applied state ---
if (status === 'applied') {
return (
<div className="flex flex-col items-center justify-center py-12 text-center">
<div className="text-5xl mb-4"></div>
<h3 className="text-lg font-medium text-gray-700 dark:text-gray-300 mb-2">
Korrekturen uebernommen
</h3>
<p className="text-sm text-gray-500 dark:text-gray-400 mb-6">
{accepted.size} von {result?.changes.length ?? 0} Korrekturen wurden angewendet.
</p>
<button
onClick={onNext}
className="px-6 py-2.5 bg-teal-600 text-white rounded-lg hover:bg-teal-700 transition-colors font-medium"
>
Weiter
</button>
</div>
)
}
// --- Done state: show diff table ---
const changes = result?.changes ?? []
if (changes.length === 0) {
return (
<div className="flex flex-col items-center justify-center py-12 text-center">
<div className="text-5xl mb-4">👍</div>
<h3 className="text-lg font-medium text-gray-700 dark:text-gray-300 mb-2">
Keine Korrekturen noetig
</h3>
<p className="text-sm text-gray-500 dark:text-gray-400 mb-1">
Das LLM hat keine OCR-Fehler gefunden.
</p>
<p className="text-xs text-gray-400 dark:text-gray-500 mb-6">
{result?.total_entries} Eintraege geprueft in {result?.duration_ms}ms
({result?.model_used})
</p>
<button
onClick={onNext}
className="px-6 py-2.5 bg-teal-600 text-white rounded-lg hover:bg-teal-700 transition-colors font-medium"
>
Weiter
</button>
</div>
)
}
return (
<div className="space-y-4">
{/* Header */}
<div className="flex items-center justify-between">
<div>
<h3 className="text-base font-medium text-gray-700 dark:text-gray-300">
LLM-Korrekturvorschlaege
</h3>
<p className="text-xs text-gray-400 mt-0.5">
{changes.length} Korrektur{changes.length !== 1 ? 'en' : ''} gefunden
· {result?.duration_ms}ms · {result?.model_used}
</p>
</div>
<div className="flex items-center gap-2">
<button
onClick={toggleAll}
className="text-xs px-3 py-1.5 border border-gray-300 dark:border-gray-600 rounded-lg hover:bg-gray-50 dark:hover:bg-gray-700 transition-colors text-gray-600 dark:text-gray-400"
>
{accepted.size === changes.length ? 'Keine' : 'Alle'} auswaehlen
</button>
</div>
</div>
{/* Diff table */}
<div className="border border-gray-200 dark:border-gray-700 rounded-lg overflow-hidden">
<table className="w-full text-sm">
<thead>
<tr className="bg-gray-50 dark:bg-gray-800 border-b border-gray-200 dark:border-gray-700">
<th className="w-10 px-3 py-2 text-center">
<input
type="checkbox"
checked={accepted.size === changes.length}
onChange={toggleAll}
className="rounded border-gray-300 dark:border-gray-600"
/>
</th>
<th className="px-3 py-2 text-left text-gray-500 dark:text-gray-400 font-medium">Zeile</th>
<th className="px-3 py-2 text-left text-gray-500 dark:text-gray-400 font-medium">Feld</th>
<th className="px-3 py-2 text-left text-gray-500 dark:text-gray-400 font-medium">Vorher</th>
<th className="px-3 py-2 text-left text-gray-500 dark:text-gray-400 font-medium">Nachher</th>
</tr>
</thead>
<tbody>
{changes.map((change, idx) => (
<tr
key={idx}
className={`border-b border-gray-100 dark:border-gray-700/50 ${
accepted.has(idx)
? 'bg-teal-50/50 dark:bg-teal-900/10'
: 'bg-white dark:bg-gray-800/50'
}`}
>
<td className="px-3 py-2 text-center">
<input
type="checkbox"
checked={accepted.has(idx)}
onChange={() => toggleChange(idx)}
className="rounded border-gray-300 dark:border-gray-600"
/>
</td>
<td className="px-3 py-2 text-gray-500 dark:text-gray-400 font-mono text-xs">
R{change.row_index}
</td>
<td className="px-3 py-2">
<span className="text-xs px-1.5 py-0.5 rounded bg-gray-100 dark:bg-gray-700 text-gray-600 dark:text-gray-400">
{FIELD_LABELS[change.field] || change.field}
</span>
</td>
<td className="px-3 py-2">
<span className="line-through text-red-500 dark:text-red-400">
{change.old}
</span>
</td>
<td className="px-3 py-2">
<span className="text-green-600 dark:text-green-400 font-medium">
{change.new}
</span>
</td>
</tr>
))}
</tbody>
</table>
</div>
{/* Actions */}
<div className="flex items-center justify-between pt-2">
<p className="text-xs text-gray-400">
{accepted.size} von {changes.length} ausgewaehlt
</p>
<div className="flex gap-3">
<button
onClick={onNext}
className="px-4 py-2 text-sm border border-gray-300 dark:border-gray-600 rounded-lg hover:bg-gray-50 dark:hover:bg-gray-700 transition-colors text-gray-600 dark:text-gray-400"
>
Alle ablehnen
</button>
<button
onClick={applyChanges}
disabled={applying || accepted.size === 0}
className="px-5 py-2 text-sm bg-teal-600 text-white rounded-lg hover:bg-teal-700 disabled:opacity-50 disabled:cursor-not-allowed transition-colors font-medium"
>
{applying ? 'Wird uebernommen...' : `${accepted.size} Korrektur${accepted.size !== 1 ? 'en' : ''} uebernehmen`}
</button>
</div>
</div>
</div>
)
}

View File

@@ -4304,3 +4304,119 @@ async def run_cv_pipeline(
result.duration_seconds = round(time.time() - total_start, 2)
return result
# ---------------------------------------------------------------------------
# LLM-based OCR Correction (Step 6)
# ---------------------------------------------------------------------------
# NOTE: imports live mid-file to keep this feature's additions in one place;
# aliases (_json, _re) avoid clashing with names used earlier in the module.
import httpx
import os
import json as _json
import re as _re

# Base URL of the local Ollama server. OLLAMA_URL wins over OLLAMA_BASE_URL;
# the default targets the Docker host from inside a container.
_OLLAMA_URL = os.getenv("OLLAMA_URL", os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434"))
# Default model for OCR correction; overridable per-request and via env.
OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:30b-a3b")
async def llm_review_entries(
    entries: List[Dict],
    model: str | None = None,
) -> Dict:
    """Send vocab entries to a local LLM (via Ollama) for OCR error correction.

    Args:
        entries: Vocab entries from Step 5; each dict is expected to carry
            ``row_index``, ``english``, ``german`` and ``example`` keys
            (missing keys are treated as empty).
        model: Ollama model name; defaults to ``OLLAMA_REVIEW_MODEL``.

    Returns:
        Dict with ``entries_original``, ``entries_corrected``, ``changes``
        (list of {row_index, field, old, new} diffs), ``model_used`` and
        ``duration_ms``.

    Raises:
        httpx.HTTPError: if the Ollama request fails or returns a non-2xx
            status (propagated to the caller, which maps it to HTTP 502).
    """
    model = model or OLLAMA_REVIEW_MODEL
    # Build a compact table representation for the prompt (short keys keep
    # the token count down on large vocab tables).
    table_lines = []
    for e in entries:
        table_lines.append({
            "row": e.get("row_index", 0),
            "en": e.get("english", ""),
            "de": e.get("german", ""),
            "ex": e.get("example", ""),
        })
    prompt = f"""Du bist ein Korrekturleser fuer OCR-erkannte Vokabeltabellen (Englisch-Deutsch).
Die Tabelle wurde per OCR aus einem Schulbuch-Scan extrahiert. Korrigiere NUR offensichtliche OCR-Fehler.
Haeufige OCR-Fehler die du korrigieren sollst:
- Ziffern statt Buchstaben: 8→B, 0→O, 1→l/I, 5→S, 6→G
- Fehlende oder falsche Satzzeichen
- Offensichtliche Tippfehler die durch OCR entstanden sind
WICHTIG:
- Aendere NICHTS was korrekt aussieht
- Erfinde KEINE neuen Woerter oder Uebersetzungen
- Behalte Abkuerzungen wie sth., sb., etc. bei
- Behalte die exakte Struktur (gleiche Anzahl Eintraege)
Antworte NUR mit dem korrigierten JSON-Array. Kein erklaerender Text.
Fuer jeden Eintrag den du aenderst, setze "corrected": true.
Fuer unveraenderte Eintraege setze "corrected": false.
Eingabe:
{_json.dumps(table_lines, ensure_ascii=False, indent=2)}"""
    t0 = time.time()
    # Non-streaming chat call; generous timeout because a 30B model on local
    # hardware can take well over a minute for large tables.
    async with httpx.AsyncClient(timeout=120.0) as client:
        resp = await client.post(
            f"{_OLLAMA_URL}/api/chat",
            json={
                "model": model,
                "messages": [{"role": "user", "content": prompt}],
                "stream": False,
                "options": {"temperature": 0.1, "num_predict": 8192},
            },
        )
        resp.raise_for_status()
        content = resp.json().get("message", {}).get("content", "")
    duration_ms = int((time.time() - t0) * 1000)
    # Parse LLM response — extract JSON array
    corrected = _parse_llm_json_array(content)
    # Build diff: compare original vs corrected. Entries are matched by
    # position; a truncated LLM response simply leaves the tail unchanged.
    changes = []
    entries_corrected = []
    for i, orig in enumerate(entries):
        if i < len(corrected):
            c = corrected[i]
            entry = dict(orig)
            for field_name, key in [("english", "en"), ("german", "de"), ("example", "ex")]:
                # str(... or "") guards against the LLM emitting null or a
                # number for a field (bare .strip() would raise AttributeError).
                new_val = str(c.get(key) or "").strip()
                old_val = (orig.get(field_name, "") or "").strip()
                # Empty new values are ignored: never let the LLM delete text.
                if new_val and new_val != old_val:
                    changes.append({
                        "row_index": orig.get("row_index", i),
                        "field": field_name,
                        "old": old_val,
                        "new": new_val,
                    })
                    entry[field_name] = new_val
                    entry["llm_corrected"] = True
            entries_corrected.append(entry)
        else:
            entries_corrected.append(dict(orig))
    return {
        "entries_original": entries,
        "entries_corrected": entries_corrected,
        "changes": changes,
        "model_used": model,
        "duration_ms": duration_ms,
    }
def _parse_llm_json_array(text: str) -> List[Dict]:
"""Extract JSON array from LLM response (may contain markdown fences)."""
# Strip markdown code fences
text = _re.sub(r'```json\s*', '', text)
text = _re.sub(r'```\s*', '', text)
# Find array
match = _re.search(r'\[.*\]', text, _re.DOTALL)
if match:
try:
return _json.loads(match.group())
except (ValueError, _json.JSONDecodeError):
pass
return []

View File

@@ -7,7 +7,7 @@ Zerlegt den OCR-Prozess in 8 einzelne Schritte:
3. Spaltenerkennung - Unsichtbare Spalten finden
4. Zeilenerkennung - Horizontale Zeilen + Kopf-/Fusszeilen
5. Worterkennung - OCR mit Bounding Boxes
6. Koordinatenzuweisung - Exakte Positionen
6. LLM-Korrektur - OCR-Fehler per LLM korrigieren
7. Seitenrekonstruktion - Seite nachbauen
8. Ground Truth Validierung - Gesamtpruefung
@@ -30,6 +30,7 @@ from fastapi.responses import Response, StreamingResponse
from pydantic import BaseModel
from cv_vocab_pipeline import (
OLLAMA_REVIEW_MODEL,
PageRegion,
RowGeometry,
_cells_to_vocab_entries,
@@ -49,6 +50,7 @@ from cv_vocab_pipeline import (
detect_row_geometry,
dewarp_image,
dewarp_image_manual,
llm_review_entries,
render_image_high_res,
render_pdf_high_res,
)
@@ -1387,6 +1389,124 @@ async def get_word_ground_truth(session_id: str):
}
# ---------------------------------------------------------------------------
# LLM Review Endpoints (Step 6)
# ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/llm-review")
async def run_llm_review(session_id: str, request: Request):
"""Run LLM-based correction on vocab entries from Step 5."""
session = await get_session_db(session_id)
if not session:
raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
word_result = session.get("word_result")
if not word_result:
raise HTTPException(status_code=400, detail="No word result found — run Step 5 first")
entries = word_result.get("vocab_entries") or word_result.get("entries") or []
if not entries:
raise HTTPException(status_code=400, detail="No vocab entries found — run Step 5 first")
# Optional model override from request body
body = {}
try:
body = await request.json()
except Exception:
pass
model = body.get("model") or OLLAMA_REVIEW_MODEL
try:
result = await llm_review_entries(entries, model=model)
except Exception as e:
logger.error(f"LLM review failed for session {session_id}: {e}")
raise HTTPException(status_code=502, detail=f"LLM review failed: {e}")
# Store result inside word_result as a sub-key
word_result["llm_review"] = {
"changes": result["changes"],
"model_used": result["model_used"],
"duration_ms": result["duration_ms"],
"entries_corrected": result["entries_corrected"],
}
await update_session_db(session_id, word_result=word_result, current_step=6)
if session_id in _cache:
_cache[session_id]["word_result"] = word_result
logger.info(f"LLM review session {session_id}: {len(result['changes'])} changes, "
f"{result['duration_ms']}ms, model={result['model_used']}")
return {
"session_id": session_id,
"changes": result["changes"],
"model_used": result["model_used"],
"duration_ms": result["duration_ms"],
"total_entries": len(entries),
"corrections_found": len(result["changes"]),
}
@router.post("/sessions/{session_id}/llm-review/apply")
async def apply_llm_corrections(session_id: str, request: Request):
"""Apply selected LLM corrections to vocab entries."""
session = await get_session_db(session_id)
if not session:
raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
word_result = session.get("word_result")
if not word_result:
raise HTTPException(status_code=400, detail="No word result found")
llm_review = word_result.get("llm_review")
if not llm_review:
raise HTTPException(status_code=400, detail="No LLM review found — run /llm-review first")
body = await request.json()
accepted_indices = set(body.get("accepted_indices", [])) # indices into changes[]
changes = llm_review.get("changes", [])
entries = word_result.get("vocab_entries") or word_result.get("entries") or []
# Build a lookup: (row_index, field) -> new_value for accepted changes
corrections = {}
applied_count = 0
for idx, change in enumerate(changes):
if idx in accepted_indices:
key = (change["row_index"], change["field"])
corrections[key] = change["new"]
applied_count += 1
# Apply corrections to entries
for entry in entries:
row_idx = entry.get("row_index", -1)
for field_name in ("english", "german", "example"):
key = (row_idx, field_name)
if key in corrections:
entry[field_name] = corrections[key]
entry["llm_corrected"] = True
# Update word_result
word_result["vocab_entries"] = entries
word_result["entries"] = entries
word_result["llm_review"]["applied_count"] = applied_count
word_result["llm_review"]["applied_at"] = datetime.utcnow().isoformat()
await update_session_db(session_id, word_result=word_result)
if session_id in _cache:
_cache[session_id]["word_result"] = word_result
logger.info(f"Applied {applied_count}/{len(changes)} LLM corrections for session {session_id}")
return {
"session_id": session_id,
"applied_count": applied_count,
"total_changes": len(changes),
}
async def _get_rows_overlay(session_id: str) -> Response:
"""Generate dewarped image with row bands drawn on it."""
session = await get_session_db(session_id)