Fix: Sidebar scrollable + add Eltern-Portal nav link
overflow-hidden → overflow-y-auto so all nav items are reachable. Added /parent (Eltern-Portal) link with people icon. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
388
klausur-service/backend/ocr/review/llm.py
Normal file
388
klausur-service/backend/ocr/review/llm.py
Normal file
@@ -0,0 +1,388 @@
|
||||
"""
|
||||
CV Review LLM — LLM-based OCR correction: prompt building, change detection, streaming.
|
||||
|
||||
Handles the LLM review path (REVIEW_ENGINE=llm) and shared utilities like
|
||||
_entry_needs_review, _is_spurious_change, _diff_batch, and JSON parsing.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)

# Ollama connection settings, overridable via environment.
# host.docker.internal resolves to the host machine from inside Docker.
_OLLAMA_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:0.6b")
# Entries per LLM request; smaller batches give faster streaming feedback.
_REVIEW_BATCH_SIZE = int(os.getenv("OLLAMA_REVIEW_BATCH_SIZE", "20"))
# Logs at import time so the active model/batch size is visible on startup.
logger.info("LLM review model: %s (batch=%d)", OLLAMA_REVIEW_MODEL, _REVIEW_BATCH_SIZE)

# Selects the correction engine used by llm_review_entries*().
REVIEW_ENGINE = os.getenv("REVIEW_ENGINE", "spell")  # "spell" (default) | "llm"

# Regex: entry contains IPA phonetic brackets like "dance [da:ns]"
# (matches a bracketed span containing at least one IPA-specific codepoint).
_HAS_PHONETIC_RE = re.compile(r'\[.*?[\u02c8\u02cc\u02d0\u0283\u0292\u03b8\u00f0\u014b\u0251\u0252\u0254\u0259\u025c\u026a\u028a\u028c\u00e6].*?\]')

# Regex: digit adjacent to a letter -- OCR digit<->letter confusion
# (covers ASCII letters plus German umlauts/eszett on either side).
# NOTE(review): not referenced in this module's visible code -- presumably
# used by importing modules; confirm before removing.
_OCR_DIGIT_IN_WORD_RE = re.compile(r'(?<=[A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df])[01568]|[01568](?=[A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df])')
|
||||
|
||||
|
||||
def _entry_needs_review(entry: Dict) -> bool:
    """Decide whether *entry* should be forwarded to the review step.

    An entry is reviewable when at least one text column is non-empty and
    neither column already carries an IPA phonetic transcription (entries
    with phonetics are trusted as-is and skipped).
    """
    english_text = entry.get("english", "") or ""
    german_text = entry.get("german", "") or ""

    # A completely blank row has nothing worth reviewing.
    if not (english_text.strip() or german_text.strip()):
        return False

    # Bracketed IPA content marks the entry as already verified.
    has_phonetic = any(
        _HAS_PHONETIC_RE.search(text) for text in (english_text, german_text)
    )
    return not has_phonetic
|
||||
|
||||
|
||||
def _build_llm_prompt(table_lines: List[Dict]) -> str:
    """Build the LLM correction prompt for a batch of entries.

    table_lines: list of dicts with short keys "row", "en", "de", "ex",
    one per vocabulary row. The German instructions deliberately restrict
    the model to single-character digit<->letter OCR fixes (0/O, 1/l, 5/S,
    6/G, 8/B, |/I) and forbid translations, re-spellings, or touching IPA
    brackets and example sentences. "/no_think" suppresses the model's
    thinking mode (qwen3 convention). Returns the prompt with the batch
    appended as pretty-printed JSON.
    """
    # NOTE: the prompt text below is runtime data sent to the model --
    # do not reformat or translate it.
    return f"""Du bist ein OCR-Zeichenkorrektur-Werkzeug fuer Vokabeltabellen (Englisch-Deutsch).

DEINE EINZIGE AUFGABE: Einzelne Zeichen korrigieren, die vom OCR-Scanner als Ziffer statt als Buchstabe erkannt wurden.

NUR diese Korrekturen sind erlaubt:
- Ziffer 8 statt B: "8en" -> "Ben", "8uch" -> "Buch", "8all" -> "Ball"
- Ziffer 0 statt O oder o: "L0ndon" -> "London", "0ld" -> "Old"
- Ziffer 1 statt l oder I: "1ong" -> "long", "Ber1in" -> "Berlin"
- Ziffer 5 statt S oder s: "5tadt" -> "Stadt", "5ee" -> "See"
- Ziffer 6 statt G oder g: "6eld" -> "Geld"
- Senkrechter Strich | statt I oder l: "| want" -> "I want", "|ong" -> "long", "he| p" -> "help"

ABSOLUT VERBOTEN -- aendere NIEMALS:
- Woerter die korrekt geschrieben sind -- auch wenn du eine andere Schreibweise kennst
- Uebersetzungen -- du uebersetzt NICHTS, weder EN->DE noch DE->EN
- Korrekte englische Woerter (en-Spalte) -- auch wenn du eine Bedeutung kennst
- Korrekte deutsche Woerter (de-Spalte) -- auch wenn du sie anders sagen wuerdest
- Eigennamen: Ben, London, China, Africa, Shakespeare usw.
- Abkuerzungen: sth., sb., etc., e.g., i.e., v.t., smb. usw.
- Lautschrift in eckigen Klammern [...] -- diese NIEMALS beruehren
- Beispielsaetze in der ex-Spalte -- NIEMALS aendern

Wenn ein Wort keinen Ziffer-Buchstaben-Fehler enthaelt: gib es UNVERAENDERT zurueck und setze "corrected": false.

Antworte NUR mit dem JSON-Array. Kein Text davor oder danach.
Behalte die exakte Struktur (gleiche Anzahl Eintraege, gleiche Reihenfolge).

/no_think

Eingabe:
{json.dumps(table_lines, ensure_ascii=False, indent=2)}"""
|
||||
|
||||
|
||||
def _is_spurious_change(old_val: str, new_val: str) -> bool:
|
||||
"""Detect LLM changes that are likely wrong and should be discarded.
|
||||
|
||||
Only digit<->letter substitutions (0->O, 1->l, 5->S, 6->G, 8->B) are
|
||||
legitimate OCR corrections. Everything else is rejected.
|
||||
"""
|
||||
if not old_val or not new_val:
|
||||
return False
|
||||
|
||||
if old_val.lower() == new_val.lower():
|
||||
return True
|
||||
|
||||
old_words = old_val.split()
|
||||
new_words = new_val.split()
|
||||
if abs(len(old_words) - len(new_words)) > 1:
|
||||
return True
|
||||
|
||||
_OCR_CHAR_MAP = {
|
||||
'0': set('oOgG'),
|
||||
'1': set('lLiI'),
|
||||
'5': set('sS'),
|
||||
'6': set('gG'),
|
||||
'8': set('bB'),
|
||||
'|': set('lLiI1'),
|
||||
'l': set('iI|1'),
|
||||
}
|
||||
has_valid_fix = False
|
||||
if len(old_val) == len(new_val):
|
||||
for oc, nc in zip(old_val, new_val):
|
||||
if oc != nc:
|
||||
if oc in _OCR_CHAR_MAP and nc in _OCR_CHAR_MAP[oc]:
|
||||
has_valid_fix = True
|
||||
elif nc in _OCR_CHAR_MAP and oc in _OCR_CHAR_MAP[nc]:
|
||||
has_valid_fix = True
|
||||
else:
|
||||
_OCR_SUSPICIOUS_RE = re.compile(r'[|01568]')
|
||||
if abs(len(old_val) - len(new_val)) <= 1 and _OCR_SUSPICIOUS_RE.search(old_val):
|
||||
has_valid_fix = True
|
||||
|
||||
if not has_valid_fix:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
    """Compare original entries with LLM-corrected ones.

    Args:
        originals: entries sent for review (dicts with "english"/"german"/
            "example" and optionally "row_index").
        corrected: parsed LLM output rows with short keys "en"/"de"/"ex".
            May be shorter than *originals* if the LLM truncated its output,
            and individual rows/values may be malformed (untrusted JSON).

    Returns:
        (changes, corrected_entries): accepted change records, plus the full
        entry list with corrections applied and "llm_corrected" flags set.
    """
    changes: List[Dict] = []
    entries_out: List[Dict] = []
    for i, orig in enumerate(originals):
        c = corrected[i] if i < len(corrected) else None
        if not isinstance(c, dict):
            # Missing or malformed LLM row (truncated output, wrong shape):
            # keep the original entry untouched instead of crashing.
            entries_out.append(dict(orig))
            continue
        entry = dict(orig)
        for field_name, key in (("english", "en"), ("german", "de"), ("example", "ex")):
            raw = c.get(key, "")
            # The LLM may emit null or non-string values; treat those as
            # "no correction" rather than calling .strip() on them.
            new_val = raw.strip() if isinstance(raw, str) else ""
            old_val = (orig.get(field_name, "") or "").strip()
            if new_val and new_val != old_val:
                if _is_spurious_change(old_val, new_val):
                    continue  # implausible edit: discard, keep original text
                changes.append({
                    "row_index": orig.get("row_index", i),
                    "field": field_name,
                    "old": old_val,
                    "new": new_val,
                })
                entry[field_name] = new_val
                entry["llm_corrected"] = True
        entries_out.append(entry)
    return changes, entries_out
|
||||
|
||||
|
||||
def _sanitize_for_json(text: str) -> str:
|
||||
"""Remove or escape control characters that break JSON parsing."""
|
||||
return re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ' ', text)
|
||||
|
||||
|
||||
def _parse_llm_json_array(text: str) -> List[Dict]:
    """Extract the first JSON array from an LLM response.

    Handles common decorations around the payload: qwen3 <think>...</think>
    blocks, markdown code fences (```json / ```), and stray ASCII control
    characters. Returns [] when no parseable array is found.
    """
    # Strip reasoning blocks and markdown fences before searching.
    text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
    text = re.sub(r'```json\s*', '', text)
    text = re.sub(r'```\s*', '', text)
    text = _sanitize_for_json(text)

    # Greedy match from first '[' to last ']' so the whole array is captured
    # even when individual entries contain brackets.
    match = re.search(r'\[.*\]', text, re.DOTALL)
    if not match:
        logger.warning("LLM review: no JSON array found in response (%.200s)", text[:200])
        return []

    try:
        # json.JSONDecodeError is a ValueError subclass, so catching
        # ValueError alone covers both (the old tuple was redundant).
        return json.loads(match.group())
    except ValueError as e:
        logger.warning("LLM review: JSON parse failed: %s | raw snippet: %.200s", e, match.group()[:200])
        return []
|
||||
|
||||
|
||||
async def llm_review_entries(
    entries: List[Dict],
    model: "str | None" = None,
) -> Dict:
    """OCR error correction. Uses spell-checker (REVIEW_ENGINE=spell) or LLM (REVIEW_ENGINE=llm).

    Args:
        entries: OCR result rows; each dict may carry "english", "german",
            "example" and "row_index" keys.
        model: Ollama model name; defaults to OLLAMA_REVIEW_MODEL.

    Returns:
        Dict with keys "entries_original", "entries_corrected", "changes",
        "skipped_count", "model_used", "duration_ms".

    Raises:
        httpx.HTTPStatusError: if the Ollama chat endpoint returns an error
            status (via raise_for_status).
    """
    # Imported lazily so the spell engine is only loaded when needed.
    from cv_review_spell import spell_review_entries_sync, _SPELL_AVAILABLE

    if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
        return spell_review_entries_sync(entries)
    if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
        # Fall through to the LLM path as a best-effort substitute.
        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")

    model = model or OLLAMA_REVIEW_MODEL

    # Keep original indices so corrections can be mapped back afterwards.
    reviewable = [(i, e) for i, e in enumerate(entries) if _entry_needs_review(e)]

    if not reviewable:
        # Nothing to send -- return copies so callers can mutate safely.
        return {
            "entries_original": entries,
            "entries_corrected": [dict(e) for e in entries],
            "changes": [],
            "skipped_count": len(entries),
            "model_used": model,
            "duration_ms": 0,
        }

    review_entries = [e for _, e in reviewable]
    # Compact rows with short keys to keep the prompt small.
    table_lines = [
        {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
        for e in review_entries
    ]

    logger.info("LLM review: sending %d/%d entries to %s (skipped %d without digit-pattern)",
                len(review_entries), len(entries), model, len(entries) - len(reviewable))

    prompt = _build_llm_prompt(table_lines)

    t0 = time.time()
    # Single non-streaming chat request; generous timeout for slow local models.
    async with httpx.AsyncClient(timeout=300.0) as client:
        resp = await client.post(
            f"{_OLLAMA_URL}/api/chat",
            json={
                "model": model,
                "messages": [{"role": "user", "content": prompt}],
                "stream": False,
                "think": False,  # suppress qwen3 thinking output
                "options": {"temperature": 0.1, "num_predict": 8192},
            },
        )
        resp.raise_for_status()
        content = resp.json().get("message", {}).get("content", "")
    duration_ms = int((time.time() - t0) * 1000)

    logger.info("LLM review: response in %dms, raw length=%d chars", duration_ms, len(content))

    corrected = _parse_llm_json_array(content)
    changes, corrected_entries = _diff_batch(review_entries, corrected)

    # Splice corrected entries back into the full list at their original
    # positions; skipped entries remain untouched copies.
    all_corrected = [dict(e) for e in entries]
    for batch_idx, (orig_idx, _) in enumerate(reviewable):
        if batch_idx < len(corrected_entries):
            all_corrected[orig_idx] = corrected_entries[batch_idx]

    return {
        "entries_original": entries,
        "entries_corrected": all_corrected,
        "changes": changes,
        "skipped_count": len(entries) - len(reviewable),
        "model_used": model,
        "duration_ms": duration_ms,
    }
|
||||
|
||||
|
||||
async def llm_review_entries_streaming(
    entries: List[Dict],
    model: "str | None" = None,
    batch_size: int = _REVIEW_BATCH_SIZE,
):
    """Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE.

    Phase 0 (always): Run _fix_character_confusion and emit any changes.

    Yields dicts with a "type" key:
        "meta"     -- totals, model name, batch size
        "batch"    -- per-batch change list and progress counters
        "complete" -- final summary incl. all changes and corrected entries

    NOTE(review): _fix_character_confusion mutates *entries* in place, so
    the caller's list is modified -- confirm callers expect this.
    """
    # Lazy imports keep engine dependencies optional at module load time.
    from cv_ocr_engines import _fix_character_confusion
    from cv_review_spell import spell_review_entries_streaming, _SPELL_AVAILABLE

    # Phase 0: deterministic character-confusion fixes, applied in place.
    # Snapshot the affected fields first so we can diff afterwards.
    _CONF_FIELDS = ('english', 'german', 'example')
    originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries]
    _fix_character_confusion(entries)
    char_changes = [
        {'row_index': i, 'field': f, 'old': originals[i][f], 'new': entries[i].get(f, '')}
        for i in range(len(entries))
        for f in _CONF_FIELDS
        if originals[i][f] != entries[i].get(f, '')
    ]

    if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
        # Spell path: forward the spell engine's events, injecting the
        # phase-0 changes as an extra batch right after the first meta event.
        _meta_sent = False
        async for event in spell_review_entries_streaming(entries, batch_size):
            yield event
            if not _meta_sent and event.get('type') == 'meta' and char_changes:
                _meta_sent = True
                yield {
                    'type': 'batch',
                    'changes': char_changes,
                    'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
                    'progress': {'current': 0, 'total': len(entries)},
                }
        return

    if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
        # Fall through to the LLM path as a best-effort substitute.
        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")

    # LLM path
    # Emit phase-0 changes immediately (before meta), if any.
    if char_changes:
        yield {
            'type': 'batch',
            'changes': char_changes,
            'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
            'progress': {'current': 0, 'total': len(entries)},
        }

    model = model or OLLAMA_REVIEW_MODEL

    # Partition entries: reviewable ones keep their original index for
    # splicing corrections back; the rest are only counted as skipped.
    reviewable = []
    skipped_indices = []
    for i, e in enumerate(entries):
        if _entry_needs_review(e):
            reviewable.append((i, e))
        else:
            skipped_indices.append(i)

    total_to_review = len(reviewable)

    yield {
        "type": "meta",
        "total_entries": len(entries),
        "to_review": total_to_review,
        "skipped": len(skipped_indices),
        "model": model,
        "batch_size": batch_size,
    }

    all_changes = []
    all_corrected = [dict(e) for e in entries]
    total_duration_ms = 0
    reviewed_count = 0

    # One LLM request per batch; a batch event is emitted after each response.
    for batch_start in range(0, total_to_review, batch_size):
        batch_items = reviewable[batch_start:batch_start + batch_size]
        batch_entries = [e for _, e in batch_items]

        # Compact rows with short keys to keep the prompt small.
        table_lines = [
            {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
            for e in batch_entries
        ]

        prompt = _build_llm_prompt(table_lines)

        logger.info("LLM review streaming: batch %d -- sending %d entries to %s",
                    batch_start // batch_size, len(batch_entries), model)

        t0 = time.time()
        async with httpx.AsyncClient(timeout=300.0) as client:
            resp = await client.post(
                f"{_OLLAMA_URL}/api/chat",
                json={
                    "model": model,
                    "messages": [{"role": "user", "content": prompt}],
                    "stream": False,
                    "think": False,  # suppress qwen3 thinking output
                    "options": {"temperature": 0.1, "num_predict": 8192},
                },
            )
            resp.raise_for_status()
            content = resp.json().get("message", {}).get("content", "")
        batch_ms = int((time.time() - t0) * 1000)
        total_duration_ms += batch_ms

        corrected = _parse_llm_json_array(content)
        batch_changes, batch_corrected = _diff_batch(batch_entries, corrected)

        # Splice corrected entries back at their original positions.
        for batch_idx, (orig_idx, _) in enumerate(batch_items):
            if batch_idx < len(batch_corrected):
                all_corrected[orig_idx] = batch_corrected[batch_idx]

        all_changes.extend(batch_changes)
        reviewed_count += len(batch_items)

        yield {
            "type": "batch",
            "batch_index": batch_start // batch_size,
            "entries_reviewed": [e.get("row_index", 0) for _, e in batch_items],
            "changes": batch_changes,
            "duration_ms": batch_ms,
            "progress": {"current": reviewed_count, "total": total_to_review},
        }

    yield {
        "type": "complete",
        "changes": all_changes,
        "model_used": model,
        "duration_ms": total_duration_ms,
        "total_entries": len(entries),
        "reviewed": total_to_review,
        "skipped": len(skipped_indices),
        "corrections_found": len(all_changes),
        "entries_corrected": all_corrected,
    }
|
||||
Reference in New Issue
Block a user