Files
breakpilot-lehrer/klausur-service/backend/cv_review_llm.py
Benjamin Admin b2a0126f14 [split-required] Split remaining Python monoliths (Phase 1 continued)
klausur-service (7 monoliths):
- grid_editor_helpers.py (1,737 → 5 files: columns, filters, headers, zones)
- cv_cell_grid.py (1,675 → 7 files: build, legacy, streaming, merge, vocab)
- worksheet_editor_api.py (1,305 → 4 files: models, AI, reconstruct, routes)
- legal_corpus_ingestion.py (1,280 → 3 files: registry, chunking, ingestion)
- cv_review.py (1,248 → 4 files: pipeline, spell, LLM, barrel)
- cv_preprocessing.py (1,166 → 3 files: deskew, dewarp, barrel)
- rbac.py, admin_api.py, routes/eh.py remain (next batch)

backend-lehrer (1 monolith):
- classroom_engine/repository.py (1,705 → 7 files by domain)

All re-export barrels preserve backward compatibility.
Zero import errors verified.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 22:47:59 +02:00

389 lines
14 KiB
Python

"""
CV Review LLM — LLM-based OCR correction: prompt building, change detection, streaming.
Handles the LLM review path (REVIEW_ENGINE=llm) and shared utilities like
_entry_needs_review, _is_spurious_change, _diff_batch, and JSON parsing.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import json
import logging
import os
import re
import time
from typing import Dict, List, Tuple
import httpx
logger = logging.getLogger(__name__)
_OLLAMA_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:0.6b")
_REVIEW_BATCH_SIZE = int(os.getenv("OLLAMA_REVIEW_BATCH_SIZE", "20"))
logger.info("LLM review model: %s (batch=%d)", OLLAMA_REVIEW_MODEL, _REVIEW_BATCH_SIZE)
REVIEW_ENGINE = os.getenv("REVIEW_ENGINE", "spell") # "spell" (default) | "llm"
# Regex: entry contains IPA phonetic brackets like "dance [da:ns]"
_HAS_PHONETIC_RE = re.compile(r'\[.*?[\u02c8\u02cc\u02d0\u0283\u0292\u03b8\u00f0\u014b\u0251\u0252\u0254\u0259\u025c\u026a\u028a\u028c\u00e6].*?\]')
# Regex: digit adjacent to a letter -- OCR digit<->letter confusion
_OCR_DIGIT_IN_WORD_RE = re.compile(r'(?<=[A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df])[01568]|[01568](?=[A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df])')
def _entry_needs_review(entry: Dict) -> bool:
"""Check if an entry should be sent for review.
Sends all non-empty entries that don't have IPA phonetic transcriptions.
"""
en = entry.get("english", "") or ""
de = entry.get("german", "") or ""
if not en.strip() and not de.strip():
return False
if _HAS_PHONETIC_RE.search(en) or _HAS_PHONETIC_RE.search(de):
return False
return True
def _build_llm_prompt(table_lines: List[Dict]) -> str:
"""Build the LLM correction prompt for a batch of entries."""
return f"""Du bist ein OCR-Zeichenkorrektur-Werkzeug fuer Vokabeltabellen (Englisch-Deutsch).
DEINE EINZIGE AUFGABE: Einzelne Zeichen korrigieren, die vom OCR-Scanner als Ziffer statt als Buchstabe erkannt wurden.
NUR diese Korrekturen sind erlaubt:
- Ziffer 8 statt B: "8en" -> "Ben", "8uch" -> "Buch", "8all" -> "Ball"
- Ziffer 0 statt O oder o: "L0ndon" -> "London", "0ld" -> "Old"
- Ziffer 1 statt l oder I: "1ong" -> "long", "Ber1in" -> "Berlin"
- Ziffer 5 statt S oder s: "5tadt" -> "Stadt", "5ee" -> "See"
- Ziffer 6 statt G oder g: "6eld" -> "Geld"
- Senkrechter Strich | statt I oder l: "| want" -> "I want", "|ong" -> "long", "he| p" -> "help"
ABSOLUT VERBOTEN -- aendere NIEMALS:
- Woerter die korrekt geschrieben sind -- auch wenn du eine andere Schreibweise kennst
- Uebersetzungen -- du uebersetzt NICHTS, weder EN->DE noch DE->EN
- Korrekte englische Woerter (en-Spalte) -- auch wenn du eine Bedeutung kennst
- Korrekte deutsche Woerter (de-Spalte) -- auch wenn du sie anders sagen wuerdest
- Eigennamen: Ben, London, China, Africa, Shakespeare usw.
- Abkuerzungen: sth., sb., etc., e.g., i.e., v.t., smb. usw.
- Lautschrift in eckigen Klammern [...] -- diese NIEMALS beruehren
- Beispielsaetze in der ex-Spalte -- NIEMALS aendern
Wenn ein Wort keinen Ziffer-Buchstaben-Fehler enthaelt: gib es UNVERAENDERT zurueck und setze "corrected": false.
Antworte NUR mit dem JSON-Array. Kein Text davor oder danach.
Behalte die exakte Struktur (gleiche Anzahl Eintraege, gleiche Reihenfolge).
/no_think
Eingabe:
{json.dumps(table_lines, ensure_ascii=False, indent=2)}"""
def _is_spurious_change(old_val: str, new_val: str) -> bool:
"""Detect LLM changes that are likely wrong and should be discarded.
Only digit<->letter substitutions (0->O, 1->l, 5->S, 6->G, 8->B) are
legitimate OCR corrections. Everything else is rejected.
"""
if not old_val or not new_val:
return False
if old_val.lower() == new_val.lower():
return True
old_words = old_val.split()
new_words = new_val.split()
if abs(len(old_words) - len(new_words)) > 1:
return True
_OCR_CHAR_MAP = {
'0': set('oOgG'),
'1': set('lLiI'),
'5': set('sS'),
'6': set('gG'),
'8': set('bB'),
'|': set('lLiI1'),
'l': set('iI|1'),
}
has_valid_fix = False
if len(old_val) == len(new_val):
for oc, nc in zip(old_val, new_val):
if oc != nc:
if oc in _OCR_CHAR_MAP and nc in _OCR_CHAR_MAP[oc]:
has_valid_fix = True
elif nc in _OCR_CHAR_MAP and oc in _OCR_CHAR_MAP[nc]:
has_valid_fix = True
else:
_OCR_SUSPICIOUS_RE = re.compile(r'[|01568]')
if abs(len(old_val) - len(new_val)) <= 1 and _OCR_SUSPICIOUS_RE.search(old_val):
has_valid_fix = True
if not has_valid_fix:
return True
return False
def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
"""Compare original entries with LLM-corrected ones, return (changes, corrected_entries)."""
changes = []
entries_out = []
for i, orig in enumerate(originals):
if i < len(corrected):
c = corrected[i]
entry = dict(orig)
for field_name, key in [("english", "en"), ("german", "de"), ("example", "ex")]:
new_val = c.get(key, "").strip()
old_val = (orig.get(field_name, "") or "").strip()
if new_val and new_val != old_val:
if _is_spurious_change(old_val, new_val):
continue
changes.append({
"row_index": orig.get("row_index", i),
"field": field_name,
"old": old_val,
"new": new_val,
})
entry[field_name] = new_val
entry["llm_corrected"] = True
entries_out.append(entry)
else:
entries_out.append(dict(orig))
return changes, entries_out
def _sanitize_for_json(text: str) -> str:
"""Remove or escape control characters that break JSON parsing."""
return re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ' ', text)
def _parse_llm_json_array(text: str) -> List[Dict]:
"""Extract JSON array from LLM response (handles markdown fences and qwen3 think-tags)."""
text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
text = re.sub(r'```json\s*', '', text)
text = re.sub(r'```\s*', '', text)
text = _sanitize_for_json(text)
match = re.search(r'\[.*\]', text, re.DOTALL)
if match:
try:
return json.loads(match.group())
except (ValueError, json.JSONDecodeError) as e:
logger.warning("LLM review: JSON parse failed: %s | raw snippet: %.200s", e, match.group()[:200])
else:
logger.warning("LLM review: no JSON array found in response (%.200s)", text[:200])
return []
async def llm_review_entries(
entries: List[Dict],
model: str = None,
) -> Dict:
"""OCR error correction. Uses spell-checker (REVIEW_ENGINE=spell) or LLM (REVIEW_ENGINE=llm)."""
from cv_review_spell import spell_review_entries_sync, _SPELL_AVAILABLE
if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
return spell_review_entries_sync(entries)
if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")
model = model or OLLAMA_REVIEW_MODEL
reviewable = [(i, e) for i, e in enumerate(entries) if _entry_needs_review(e)]
if not reviewable:
return {
"entries_original": entries,
"entries_corrected": [dict(e) for e in entries],
"changes": [],
"skipped_count": len(entries),
"model_used": model,
"duration_ms": 0,
}
review_entries = [e for _, e in reviewable]
table_lines = [
{"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
for e in review_entries
]
logger.info("LLM review: sending %d/%d entries to %s (skipped %d without digit-pattern)",
len(review_entries), len(entries), model, len(entries) - len(reviewable))
prompt = _build_llm_prompt(table_lines)
t0 = time.time()
async with httpx.AsyncClient(timeout=300.0) as client:
resp = await client.post(
f"{_OLLAMA_URL}/api/chat",
json={
"model": model,
"messages": [{"role": "user", "content": prompt}],
"stream": False,
"think": False,
"options": {"temperature": 0.1, "num_predict": 8192},
},
)
resp.raise_for_status()
content = resp.json().get("message", {}).get("content", "")
duration_ms = int((time.time() - t0) * 1000)
logger.info("LLM review: response in %dms, raw length=%d chars", duration_ms, len(content))
corrected = _parse_llm_json_array(content)
changes, corrected_entries = _diff_batch(review_entries, corrected)
all_corrected = [dict(e) for e in entries]
for batch_idx, (orig_idx, _) in enumerate(reviewable):
if batch_idx < len(corrected_entries):
all_corrected[orig_idx] = corrected_entries[batch_idx]
return {
"entries_original": entries,
"entries_corrected": all_corrected,
"changes": changes,
"skipped_count": len(entries) - len(reviewable),
"model_used": model,
"duration_ms": duration_ms,
}
async def llm_review_entries_streaming(
entries: List[Dict],
model: str = None,
batch_size: int = _REVIEW_BATCH_SIZE,
):
"""Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE.
Phase 0 (always): Run _fix_character_confusion and emit any changes.
"""
from cv_ocr_engines import _fix_character_confusion
from cv_review_spell import spell_review_entries_streaming, _SPELL_AVAILABLE
_CONF_FIELDS = ('english', 'german', 'example')
originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries]
_fix_character_confusion(entries)
char_changes = [
{'row_index': i, 'field': f, 'old': originals[i][f], 'new': entries[i].get(f, '')}
for i in range(len(entries))
for f in _CONF_FIELDS
if originals[i][f] != entries[i].get(f, '')
]
if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
_meta_sent = False
async for event in spell_review_entries_streaming(entries, batch_size):
yield event
if not _meta_sent and event.get('type') == 'meta' and char_changes:
_meta_sent = True
yield {
'type': 'batch',
'changes': char_changes,
'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
'progress': {'current': 0, 'total': len(entries)},
}
return
if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")
# LLM path
if char_changes:
yield {
'type': 'batch',
'changes': char_changes,
'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
'progress': {'current': 0, 'total': len(entries)},
}
model = model or OLLAMA_REVIEW_MODEL
reviewable = []
skipped_indices = []
for i, e in enumerate(entries):
if _entry_needs_review(e):
reviewable.append((i, e))
else:
skipped_indices.append(i)
total_to_review = len(reviewable)
yield {
"type": "meta",
"total_entries": len(entries),
"to_review": total_to_review,
"skipped": len(skipped_indices),
"model": model,
"batch_size": batch_size,
}
all_changes = []
all_corrected = [dict(e) for e in entries]
total_duration_ms = 0
reviewed_count = 0
for batch_start in range(0, total_to_review, batch_size):
batch_items = reviewable[batch_start:batch_start + batch_size]
batch_entries = [e for _, e in batch_items]
table_lines = [
{"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
for e in batch_entries
]
prompt = _build_llm_prompt(table_lines)
logger.info("LLM review streaming: batch %d -- sending %d entries to %s",
batch_start // batch_size, len(batch_entries), model)
t0 = time.time()
async with httpx.AsyncClient(timeout=300.0) as client:
resp = await client.post(
f"{_OLLAMA_URL}/api/chat",
json={
"model": model,
"messages": [{"role": "user", "content": prompt}],
"stream": False,
"think": False,
"options": {"temperature": 0.1, "num_predict": 8192},
},
)
resp.raise_for_status()
content = resp.json().get("message", {}).get("content", "")
batch_ms = int((time.time() - t0) * 1000)
total_duration_ms += batch_ms
corrected = _parse_llm_json_array(content)
batch_changes, batch_corrected = _diff_batch(batch_entries, corrected)
for batch_idx, (orig_idx, _) in enumerate(batch_items):
if batch_idx < len(batch_corrected):
all_corrected[orig_idx] = batch_corrected[batch_idx]
all_changes.extend(batch_changes)
reviewed_count += len(batch_items)
yield {
"type": "batch",
"batch_index": batch_start // batch_size,
"entries_reviewed": [e.get("row_index", 0) for _, e in batch_items],
"changes": batch_changes,
"duration_ms": batch_ms,
"progress": {"current": reviewed_count, "total": total_to_review},
}
yield {
"type": "complete",
"changes": all_changes,
"model_used": model,
"duration_ms": total_duration_ms,
"total_entries": len(entries),
"reviewed": total_to_review,
"skipped": len(skipped_indices),
"corrections_found": len(all_changes),
"entries_corrected": all_corrected,
}