diff --git a/admin-lehrer/app/(admin)/ai/ocr-pipeline/page.tsx b/admin-lehrer/app/(admin)/ai/ocr-pipeline/page.tsx index 4e45621..cecf749 100644 --- a/admin-lehrer/app/(admin)/ai/ocr-pipeline/page.tsx +++ b/admin-lehrer/app/(admin)/ai/ocr-pipeline/page.tsx @@ -8,7 +8,7 @@ import { StepDewarp } from '@/components/ocr-pipeline/StepDewarp' import { StepColumnDetection } from '@/components/ocr-pipeline/StepColumnDetection' import { StepRowDetection } from '@/components/ocr-pipeline/StepRowDetection' import { StepWordRecognition } from '@/components/ocr-pipeline/StepWordRecognition' -import { StepCoordinates } from '@/components/ocr-pipeline/StepCoordinates' +import { StepLlmReview } from '@/components/ocr-pipeline/StepLlmReview' import { StepReconstruction } from '@/components/ocr-pipeline/StepReconstruction' import { StepGroundTruth } from '@/components/ocr-pipeline/StepGroundTruth' import { PIPELINE_STEPS, type PipelineStep, type SessionListItem } from './types' @@ -155,7 +155,7 @@ export default function OcrPipelinePage() { 3: 'Spalten', 4: 'Zeilen', 5: 'Woerter', - 6: 'Koordinaten', + 6: 'LLM-Korrektur', 7: 'Rekonstruktion', 8: 'Validierung', } @@ -173,7 +173,7 @@ export default function OcrPipelinePage() { case 4: return case 5: - return + return case 6: return case 7: diff --git a/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts b/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts index 3cec591..ac9bdc4 100644 --- a/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts +++ b/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts @@ -208,7 +208,7 @@ export const PIPELINE_STEPS: PipelineStep[] = [ { id: 'columns', name: 'Spalten', icon: '๐Ÿ“Š', status: 'pending' }, { id: 'rows', name: 'Zeilen', icon: '๐Ÿ“', status: 'pending' }, { id: 'words', name: 'Woerter', icon: '๐Ÿ”ค', status: 'pending' }, - { id: 'coordinates', name: 'Koordinaten', icon: '๐Ÿ“', status: 'pending' }, + { id: 'llm-review', name: 'LLM-Korrektur', icon: '๐Ÿค–', status: 'pending' }, { id: 'reconstruction', 
name: 'Rekonstruktion', icon: '๐Ÿ—๏ธ', status: 'pending' }, { id: 'ground-truth', name: 'Validierung', icon: 'โœ…', status: 'pending' }, ] diff --git a/admin-lehrer/components/ocr-pipeline/StepLlmReview.tsx b/admin-lehrer/components/ocr-pipeline/StepLlmReview.tsx new file mode 100644 index 0000000..81dfac4 --- /dev/null +++ b/admin-lehrer/components/ocr-pipeline/StepLlmReview.tsx @@ -0,0 +1,345 @@ +'use client' + +import { useCallback, useState } from 'react' + +const KLAUSUR_API = '/klausur-api' + +interface LlmChange { + row_index: number + field: 'english' | 'german' | 'example' + old: string + new: string +} + +interface LlmReviewResult { + changes: LlmChange[] + model_used: string + duration_ms: number + total_entries: number + corrections_found: number +} + +interface StepLlmReviewProps { + sessionId: string | null + onNext: () => void +} + +const FIELD_LABELS: Record = { + english: 'EN', + german: 'DE', + example: 'Beispiel', +} + +export function StepLlmReview({ sessionId, onNext }: StepLlmReviewProps) { + const [status, setStatus] = useState<'idle' | 'running' | 'done' | 'error' | 'applied'>('idle') + const [result, setResult] = useState(null) + const [error, setError] = useState('') + const [accepted, setAccepted] = useState>(new Set()) + const [applying, setApplying] = useState(false) + + const runReview = useCallback(async () => { + if (!sessionId) return + setStatus('running') + setError('') + setResult(null) + + try { + const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/llm-review`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({}), + }) + + if (!res.ok) { + const data = await res.json().catch(() => ({})) + throw new Error(data.detail || `HTTP ${res.status}`) + } + + const data: LlmReviewResult = await res.json() + setResult(data) + // Accept all changes by default + setAccepted(new Set(data.changes.map((_, i) => i))) + setStatus('done') + } catch (e: unknown) { + const 
msg = e instanceof Error ? e.message : String(e) + setError(msg) + setStatus('error') + } + }, [sessionId]) + + const toggleChange = (index: number) => { + setAccepted((prev) => { + const next = new Set(prev) + if (next.has(index)) next.delete(index) + else next.add(index) + return next + }) + } + + const toggleAll = () => { + if (!result) return + if (accepted.size === result.changes.length) { + setAccepted(new Set()) + } else { + setAccepted(new Set(result.changes.map((_, i) => i))) + } + } + + const applyChanges = useCallback(async () => { + if (!sessionId || !result) return + setApplying(true) + + try { + const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/llm-review/apply`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ accepted_indices: Array.from(accepted) }), + }) + + if (!res.ok) { + const data = await res.json().catch(() => ({})) + throw new Error(data.detail || `HTTP ${res.status}`) + } + + setStatus('applied') + } catch (e: unknown) { + const msg = e instanceof Error ? e.message : String(e) + setError(msg) + } finally { + setApplying(false) + } + }, [sessionId, result, accepted]) + + if (!sessionId) { + return ( +
+ Bitte zuerst eine Session auswaehlen. +
+ ) + } + + // --- Idle state --- + if (status === 'idle') { + return ( +
+
๐Ÿค–
+

+ Schritt 6: LLM-Korrektur +

+

+ Ein lokales Sprachmodell prueft die OCR-Ergebnisse auf typische Erkennungsfehler + (z.B. "8en" statt "Ben") und schlaegt Korrekturen vor. +

+

+ Modell: qwen3:30b-a3b via Ollama (lokal) +

+ +
+ ) + } + + // --- Running state --- + if (status === 'running') { + return ( +
+
+

+ Korrektur laeuft... +

+

+ qwen3:30b-a3b prueft die Vokabeleintraege +

+
+ ) + } + + // --- Error state --- + if (status === 'error') { + return ( +
+
โš ๏ธ
+

+ Fehler bei LLM-Korrektur +

+

+ {error} +

+
+ + +
+
+ ) + } + + // --- Applied state --- + if (status === 'applied') { + return ( +
+
โœ…
+

+ Korrekturen uebernommen +

+

+ {accepted.size} von {result?.changes.length ?? 0} Korrekturen wurden angewendet. +

+ +
+ ) + } + + // --- Done state: show diff table --- + const changes = result?.changes ?? [] + + if (changes.length === 0) { + return ( +
+
๐Ÿ‘
+

+ Keine Korrekturen noetig +

+

+ Das LLM hat keine OCR-Fehler gefunden. +

+

+ {result?.total_entries} Eintraege geprueft in {result?.duration_ms}ms + ({result?.model_used}) +

+ +
+ ) + } + + return ( +
+ {/* Header */} +
+
+

+ LLM-Korrekturvorschlaege +

+

+ {changes.length} Korrektur{changes.length !== 1 ? 'en' : ''} gefunden + ยท {result?.duration_ms}ms ยท {result?.model_used} +

+
+
+ +
+
+ + {/* Diff table */} +
+ + + + + + + + + + + + {changes.map((change, idx) => ( + + + + + + + + ))} + +
+ + ZeileFeldVorherNachher
+ toggleChange(idx)} + className="rounded border-gray-300 dark:border-gray-600" + /> + + R{change.row_index} + + + {FIELD_LABELS[change.field] || change.field} + + + + {change.old} + + + + {change.new} + +
+
+ + {/* Actions */} +
+

+ {accepted.size} von {changes.length} ausgewaehlt +

+
+ + +
+
+
+  )
+}
diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 3c4d0f1..c1bd756 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -4304,3 +4304,119 @@ async def run_cv_pipeline(
     result.duration_seconds = round(time.time() - total_start, 2)
 
     return result
+
+
+# ---------------------------------------------------------------------------
+# LLM-based OCR Correction (Step 6)
+# ---------------------------------------------------------------------------
+
+import httpx
+import os
+import json as _json
+import re as _re
+from typing import Optional
+
+# Base URL of the local Ollama server; OLLAMA_URL wins over OLLAMA_BASE_URL.
+_OLLAMA_URL = os.getenv("OLLAMA_URL", os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434"))
+OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:30b-a3b")
+
+
+async def llm_review_entries(
+    entries: List[Dict],
+    model: Optional[str] = None,
+) -> Dict:
+    """Send vocab entries to a local LLM for OCR error correction.
+
+    Returns a dict with keys: entries_original, entries_corrected, changes,
+    model_used, duration_ms. Raises httpx.HTTPError when the Ollama endpoint
+    is unreachable or answers with a non-2xx status.
+    """
+    model = model or OLLAMA_REVIEW_MODEL
+
+    # Compact table representation keeps the prompt (and token count) small.
+    table_lines = [
+        {
+            "row": e.get("row_index", 0),
+            "en": e.get("english", ""),
+            "de": e.get("german", ""),
+            "ex": e.get("example", ""),
+        }
+        for e in entries
+    ]
+
+    prompt = f"""Du bist ein Korrekturleser fuer OCR-erkannte Vokabeltabellen (Englisch-Deutsch).
+Die Tabelle wurde per OCR aus einem Schulbuch-Scan extrahiert. Korrigiere NUR offensichtliche OCR-Fehler.
+
+Haeufige OCR-Fehler die du korrigieren sollst:
+- Ziffern statt Buchstaben: 8→B, 0→O, 1→l/I, 5→S, 6→G
+- Fehlende oder falsche Satzzeichen
+- Offensichtliche Tippfehler die durch OCR entstanden sind
+
+WICHTIG:
+- Aendere NICHTS was korrekt aussieht
+- Erfinde KEINE neuen Woerter oder Uebersetzungen
+- Behalte Abkuerzungen wie sth., sb., etc. bei
+- Behalte die exakte Struktur (gleiche Anzahl Eintraege)
+
+Antworte NUR mit dem korrigierten JSON-Array. Kein erklaerender Text.
+Fuer jeden Eintrag den du aenderst, setze "corrected": true.
+Fuer unveraenderte Eintraege setze "corrected": false.
+
+Eingabe:
+{_json.dumps(table_lines, ensure_ascii=False, indent=2)}"""
+
+    t0 = time.time()
+    async with httpx.AsyncClient(timeout=120.0) as client:
+        resp = await client.post(
+            f"{_OLLAMA_URL}/api/chat",
+            json={
+                "model": model,
+                "messages": [{"role": "user", "content": prompt}],
+                "stream": False,
+                # Low temperature: we want conservative, reproducible fixes.
+                "options": {"temperature": 0.1, "num_predict": 8192},
+            },
+        )
+        resp.raise_for_status()
+        content = resp.json().get("message", {}).get("content", "")
+    duration_ms = int((time.time() - t0) * 1000)
+
+    # Parse LLM response — extract the JSON array.
+    corrected = _parse_llm_json_array(content)
+
+    # Build the diff, matched positionally: the prompt demands the model
+    # keep the exact entry count and order.
+    changes = []
+    entries_corrected = []
+    for i, orig in enumerate(entries):
+        if i >= len(corrected):
+            # Model returned fewer rows than we sent — keep the original.
+            entries_corrected.append(dict(orig))
+            continue
+        c = corrected[i]
+        entry = dict(orig)
+        for field_name, key in (("english", "en"), ("german", "de"), ("example", "ex")):
+            # Coerce defensively: the model occasionally emits numbers/null.
+            new_val = str(c.get(key, "") or "").strip()
+            old_val = (orig.get(field_name, "") or "").strip()
+            if new_val and new_val != old_val:
+                changes.append({
+                    "row_index": orig.get("row_index", i),
+                    "field": field_name,
+                    "old": old_val,
+                    "new": new_val,
+                })
+                entry[field_name] = new_val
+                entry["llm_corrected"] = True
+        entries_corrected.append(entry)
+
+    return {
+        "entries_original": entries,
+        "entries_corrected": entries_corrected,
+        "changes": changes,
+        "model_used": model,
+        "duration_ms": duration_ms,
+    }
+
+
+def _parse_llm_json_array(text: str) -> List[Dict]:
+    """Extract a JSON array of objects from an LLM response.
+
+    Tolerates markdown code fences and — for reasoning models such as
+    qwen3 — <think>...</think> blocks wrapped around the actual answer.
+    """
+    # Drop a possible chain-of-thought block first.
+    text = _re.sub(r'<think>.*?</think>', '', text, flags=_re.DOTALL)
+    # Strip markdown code fences.
+    text = _re.sub(r'```json\s*', '', text)
+    text = _re.sub(r'```\s*', '', text)
+    match = _re.search(r'\[.*\]', text, _re.DOTALL)
+    if not match:
+        return []
+    try:
+        parsed = _json.loads(match.group())
+    except ValueError:  # json.JSONDecodeError subclasses ValueError
+        return []
+    if not isinstance(parsed, list):
+        return []
+    # Keep only dict items so callers can .get() on each safely.
+    return [item for item in parsed if isinstance(item, dict)]
diff --git
a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py
index b0a95fc..23ac386 100644
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -7,7 +7,7 @@ Zerlegt den OCR-Prozess in 8 einzelne Schritte:
 3. Spaltenerkennung - Unsichtbare Spalten finden
 4. Zeilenerkennung - Horizontale Zeilen + Kopf-/Fusszeilen
 5. Worterkennung - OCR mit Bounding Boxes
-6. Koordinatenzuweisung - Exakte Positionen
+6. LLM-Korrektur - OCR-Fehler per LLM korrigieren
 7. Seitenrekonstruktion - Seite nachbauen
 8. Ground Truth Validierung - Gesamtpruefung
 
@@ -30,6 +30,7 @@ from fastapi.responses import Response, StreamingResponse
 from pydantic import BaseModel
 
 from cv_vocab_pipeline import (
+    OLLAMA_REVIEW_MODEL,
     PageRegion,
     RowGeometry,
     _cells_to_vocab_entries,
@@ -49,6 +50,7 @@ from cv_vocab_pipeline import (
     detect_row_geometry,
     dewarp_image,
     dewarp_image_manual,
+    llm_review_entries,
     render_image_high_res,
     render_pdf_high_res,
 )
@@ -1387,6 +1389,124 @@ async def get_word_ground_truth(session_id: str):
     }
 
 
+# ---------------------------------------------------------------------------
+# LLM Review Endpoints (Step 6)
+# ---------------------------------------------------------------------------
+
+
+@router.post("/sessions/{session_id}/llm-review")
+async def run_llm_review(session_id: str, request: Request):
+    """Run LLM-based correction on the vocab entries produced by Step 5.
+
+    Optional JSON body: {"model": "<ollama model name>"} overrides the
+    configured review model. Returns proposed changes WITHOUT applying them;
+    /llm-review/apply commits a user-selected subset.
+    """
+    session = await get_session_db(session_id)
+    if not session:
+        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
+
+    word_result = session.get("word_result")
+    if not word_result:
+        raise HTTPException(status_code=400, detail="No word result found — run Step 5 first")
+
+    entries = word_result.get("vocab_entries") or word_result.get("entries") or []
+    if not entries:
+        raise HTTPException(status_code=400, detail="No vocab entries found — run Step 5 first")
+
+    # Optional model override; an absent, empty, malformed, or non-object
+    # body all fall back to the default model (a list body previously hit
+    # body.get() and crashed with AttributeError -> 500).
+    body = {}
+    try:
+        parsed = await request.json()
+        if isinstance(parsed, dict):
+            body = parsed
+    except Exception:
+        pass
+    model = body.get("model") or OLLAMA_REVIEW_MODEL
+
+    try:
+        result = await llm_review_entries(entries, model=model)
+    except Exception as e:
+        logger.error(f"LLM review failed for session {session_id}: {e}")
+        # 502: the upstream Ollama call failed, not this service.
+        raise HTTPException(status_code=502, detail=f"LLM review failed: {e}") from e
+
+    # Store the review inside word_result so Step 6 state survives reloads.
+    word_result["llm_review"] = {
+        "changes": result["changes"],
+        "model_used": result["model_used"],
+        "duration_ms": result["duration_ms"],
+        "entries_corrected": result["entries_corrected"],
+    }
+    await update_session_db(session_id, word_result=word_result, current_step=6)
+
+    if session_id in _cache:
+        _cache[session_id]["word_result"] = word_result
+
+    logger.info(f"LLM review session {session_id}: {len(result['changes'])} changes, "
+                f"{result['duration_ms']}ms, model={result['model_used']}")
+
+    return {
+        "session_id": session_id,
+        "changes": result["changes"],
+        "model_used": result["model_used"],
+        "duration_ms": result["duration_ms"],
+        "total_entries": len(entries),
+        "corrections_found": len(result["changes"]),
+    }
+
+
+@router.post("/sessions/{session_id}/llm-review/apply")
+async def apply_llm_corrections(session_id: str, request: Request):
+    """Apply a selected subset of LLM corrections to the vocab entries.
+
+    JSON body: {"accepted_indices": [int, ...]} — indices into the changes[]
+    list returned by /llm-review.
+    """
+    session = await get_session_db(session_id)
+    if not session:
+        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
+
+    word_result = session.get("word_result")
+    if not word_result:
+        raise HTTPException(status_code=400, detail="No word result found")
+
+    llm_review = word_result.get("llm_review")
+    if not llm_review:
+        raise HTTPException(status_code=400, detail="No LLM review found — run /llm-review first")
+
+    # A malformed body is a client error (400), not an unhandled 500.
+    try:
+        body = await request.json()
+    except Exception as e:
+        raise HTTPException(status_code=400, detail="Invalid JSON body") from e
+    if not isinstance(body, dict):
+        raise HTTPException(status_code=400, detail="JSON body must be an object")
+    # Silently drop non-integer indices instead of failing the request.
+    accepted_indices = {i for i in body.get("accepted_indices", []) if isinstance(i, int)}
+
+    changes = llm_review.get("changes", [])
+    entries = word_result.get("vocab_entries") or word_result.get("entries") or []
+
+    # Lookup: (row_index, field) -> new value, for accepted changes only.
+    corrections = {}
+    applied_count = 0
+    for idx, change in enumerate(changes):
+        if idx in accepted_indices:
+            corrections[(change["row_index"], change["field"])] = change["new"]
+            applied_count += 1
+
+    # Apply the accepted corrections to the entries in place.
+    for entry in entries:
+        row_idx = entry.get("row_index", -1)
+        for field_name in ("english", "german", "example"):
+            key = (row_idx, field_name)
+            if key in corrections:
+                entry[field_name] = corrections[key]
+                entry["llm_corrected"] = True
+
+    # Persist under both keys — readers use either "vocab_entries" or "entries".
+    word_result["vocab_entries"] = entries
+    word_result["entries"] = entries
+    word_result["llm_review"]["applied_count"] = applied_count
+    # NOTE(review): datetime.utcnow() is deprecated since Python 3.12 —
+    # prefer datetime.now(timezone.utc) once timezone is imported here.
+    word_result["llm_review"]["applied_at"] = datetime.utcnow().isoformat()
+
+    await update_session_db(session_id, word_result=word_result)
+
+    if session_id in _cache:
+        _cache[session_id]["word_result"] = word_result
+
+    logger.info(f"Applied {applied_count}/{len(changes)} LLM corrections for session {session_id}")
+
+    return {
+        "session_id": session_id,
+        "applied_count": applied_count,
+        "total_changes": len(changes),
+    }
+
+
 async def _get_rows_overlay(session_id: str) -> Response:
     """Generate dewarped image with row bands drawn on it."""
     session = await get_session_db(session_id)