breakpilot-lehrer/klausur-service/backend/cv_review_llm.py

"""
CV Review LLM — LLM-based OCR correction: prompt building, change detection, streaming.

Handles the LLM review path (REVIEW_ENGINE=llm) and shared utilities like
_entry_needs_review, _is_spurious_change, _diff_batch, and JSON parsing.

Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""

import json
import logging
import os
import re
import time
from typing import Dict, List, Optional, Tuple

import httpx

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Runtime configuration, each value overridable through an environment variable.
# _OLLAMA_URL is the base URL; the review functions below POST to "<base>/api/chat".
_OLLAMA_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:0.6b")
# NOTE: int() raises ValueError at import time if the variable is set to a non-integer.
_REVIEW_BATCH_SIZE = int(os.getenv("OLLAMA_REVIEW_BATCH_SIZE", "20"))
# Logged once at import so the active model/batch size is visible in the service log.
logger.info("LLM review model: %s (batch=%d)", OLLAMA_REVIEW_MODEL, _REVIEW_BATCH_SIZE)

REVIEW_ENGINE = os.getenv("REVIEW_ENGINE", "spell")   # "spell" (default) | "llm"

# Regex: a square-bracketed span containing at least one IPA-specific symbol
# (stress marks ˈ ˌ, length mark ː, or one of ʃ ʒ θ ð ŋ ɑ ɒ ɔ ə ɜ ɪ ʊ ʌ æ),
# e.g. "dance [dɑːns]". Brackets holding only plain ASCII text do not match.
_HAS_PHONETIC_RE = re.compile(r'\[.*?[\u02c8\u02cc\u02d0\u0283\u0292\u03b8\u00f0\u014b\u0251\u0252\u0254\u0259\u025c\u026a\u028a\u028c\u00e6].*?\]')

# Regex: one of the digits 0/1/5/6/8 directly preceded or followed by a letter
# (ASCII A-Z/a-z or German Ä Ö Ü ä ö ü ß) -- OCR digit<->letter confusion.
# Not referenced by the code visible in this part of the file.
_OCR_DIGIT_IN_WORD_RE = re.compile(r'(?<=[A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df])[01568]|[01568](?=[A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df])')


def _entry_needs_review(entry: Dict) -> bool:
    """Decide whether an entry is worth sending to the reviewer.

    An entry qualifies when at least one of its ``english`` / ``german``
    fields holds non-whitespace text and neither field matches
    ``_HAS_PHONETIC_RE`` (a bracketed IPA transcription). Missing or
    ``None`` fields are treated as empty strings.
    """
    english_text = entry.get("english", "") or ""
    german_text = entry.get("german", "") or ""
    columns = (english_text, german_text)

    # Both columns blank: nothing to review.
    if not any(column.strip() for column in columns):
        return False

    # Entries carrying a phonetic transcription are left alone.
    return not any(_HAS_PHONETIC_RE.search(column) for column in columns)


def _build_llm_prompt(table_lines: List[Dict]) -> str:
    """Assemble the correction prompt sent to the LLM for one batch.

    The fixed German instruction text limits the model to digit/pipe ->
    letter fixes and asks for a bare JSON array in return. The batch is
    appended after the ``Eingabe:`` marker as pretty-printed JSON with
    non-ASCII characters kept verbatim.
    """
    serialized_batch = json.dumps(table_lines, ensure_ascii=False, indent=2)
    instructions = """Du bist ein OCR-Zeichenkorrektur-Werkzeug fuer Vokabeltabellen (Englisch-Deutsch).

DEINE EINZIGE AUFGABE: Einzelne Zeichen korrigieren, die vom OCR-Scanner als Ziffer statt als Buchstabe erkannt wurden.

NUR diese Korrekturen sind erlaubt:
- Ziffer 8 statt B: "8en" -> "Ben", "8uch" -> "Buch", "8all" -> "Ball"
- Ziffer 0 statt O oder o: "L0ndon" -> "London", "0ld" -> "Old"
- Ziffer 1 statt l oder I: "1ong" -> "long", "Ber1in" -> "Berlin"
- Ziffer 5 statt S oder s: "5tadt" -> "Stadt", "5ee" -> "See"
- Ziffer 6 statt G oder g: "6eld" -> "Geld"
- Senkrechter Strich | statt I oder l: "| want" -> "I want", "|ong" -> "long", "he| p" -> "help"

ABSOLUT VERBOTEN -- aendere NIEMALS:
- Woerter die korrekt geschrieben sind -- auch wenn du eine andere Schreibweise kennst
- Uebersetzungen -- du uebersetzt NICHTS, weder EN->DE noch DE->EN
- Korrekte englische Woerter (en-Spalte) -- auch wenn du eine Bedeutung kennst
- Korrekte deutsche Woerter (de-Spalte) -- auch wenn du sie anders sagen wuerdest
- Eigennamen: Ben, London, China, Africa, Shakespeare usw.
- Abkuerzungen: sth., sb., etc., e.g., i.e., v.t., smb. usw.
- Lautschrift in eckigen Klammern [...] -- diese NIEMALS beruehren
- Beispielsaetze in der ex-Spalte -- NIEMALS aendern

Wenn ein Wort keinen Ziffer-Buchstaben-Fehler enthaelt: gib es UNVERAENDERT zurueck und setze "corrected": false.

Antworte NUR mit dem JSON-Array. Kein Text davor oder danach.
Behalte die exakte Struktur (gleiche Anzahl Eintraege, gleiche Reihenfolge).

/no_think

Eingabe:
"""
    return instructions + serialized_batch


def _is_spurious_change(old_val: str, new_val: str) -> bool:
    """Tell whether an LLM edit should be thrown away.

    Returns ``True`` (discard) for case-only edits, for edits that move the
    word count by more than one, and for edits without any plausible OCR
    character fix. Returns ``False`` (keep) otherwise, and also when either
    value is empty, since there is nothing to compare.

    A plausible fix is:
    * same length: at least one differing position pairs up characters that
      are listed as confusable (digit/pipe vs. letter, either direction);
    * different length: the lengths differ by at most one character and the
      old text contains ``|`` or one of the digits 0, 1, 5, 6, 8.
    """
    if not (old_val and new_val):
        return False

    # A pure casing difference is never an OCR correction.
    if old_val.lower() == new_val.lower():
        return True

    # Word count moved by more than one: the text was rewritten, not fixed.
    word_delta = len(old_val.split()) - len(new_val.split())
    if abs(word_delta) > 1:
        return True

    confusable = {
        '0': set('oOgG'),
        '1': set('lLiI'),
        '5': set('sS'),
        '6': set('gG'),
        '8': set('bB'),
        '|': set('lLiI1'),
        'l': set('iI|1'),
    }

    def _is_ocr_pair(before: str, after: str) -> bool:
        # The table is consulted in both directions.
        return after in confusable.get(before, ()) or before in confusable.get(after, ())

    if len(old_val) == len(new_val):
        plausible = any(
            _is_ocr_pair(before, after)
            for before, after in zip(old_val, new_val)
            if before != after
        )
    else:
        length_delta = abs(len(old_val) - len(new_val))
        has_suspect_char = re.search(r'[|01568]', old_val) is not None
        plausible = length_delta <= 1 and has_suspect_char

    return not plausible


def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
    """Compare original entries with LLM-corrected ones.

    Entries are paired by list position. For each pair the LLM keys
    ``en`` / ``de`` / ``ex`` are compared against the original fields
    ``english`` / ``german`` / ``example`` (both sides stripped). A
    non-empty suggestion that differs from the original and is not rejected
    by ``_is_spurious_change`` is applied to a copy of the entry, which is
    then flagged with ``llm_corrected = True``.

    The LLM output is untrusted: array elements that are not objects and
    field values that are not strings (e.g. JSON ``null`` or numbers) are
    ignored instead of raising.

    Args:
        originals: Entries that were sent for review.
        corrected: Parsed LLM response; may be shorter than ``originals``,
            empty, or contain malformed items.

    Returns:
        ``(changes, corrected_entries)``. ``changes`` holds one dict per
        applied edit with ``row_index`` (the entry's own ``row_index`` or,
        if absent, its position in ``originals``), ``field``, ``old`` and
        ``new``. ``corrected_entries`` has exactly one shallow copy per
        original entry, in the same order.
    """
    field_map = (("english", "en"), ("german", "de"), ("example", "ex"))
    corrected = corrected or []
    changes: List[Dict] = []
    entries_out: List[Dict] = []

    for i, orig in enumerate(originals):
        entry = dict(orig)
        entries_out.append(entry)

        suggestion = corrected[i] if i < len(corrected) else None
        if not isinstance(suggestion, dict):
            # Missing or malformed item: keep the original entry as is.
            continue

        for field_name, key in field_map:
            raw_new = suggestion.get(key)
            if not isinstance(raw_new, str):
                # null / number / nested value: not a usable suggestion.
                continue
            new_val = raw_new.strip()
            old_val = (orig.get(field_name, "") or "").strip()
            if not new_val or new_val == old_val:
                continue
            if _is_spurious_change(old_val, new_val):
                continue
            changes.append({
                "row_index": orig.get("row_index", i),
                "field": field_name,
                "old": old_val,
                "new": new_val,
            })
            entry[field_name] = new_val
            entry["llm_corrected"] = True

    return changes, entries_out


def _sanitize_for_json(text: str) -> str:
    """Blank out control characters that break JSON parsing.

    Every character below 0x20 except tab, line feed and carriage return,
    plus DEL (0x7f), is replaced by a single space, so the text keeps its
    length.
    """
    preserved = {0x09, 0x0A, 0x0D}
    blank_table = {
        code: " "
        for code in (*range(0x20), 0x7F)
        if code not in preserved
    }
    return text.translate(blank_table)


def _parse_llm_json_array(text: str) -> List[Dict]:
    """Pull the JSON array out of a raw LLM response.

    Removes ``<think>...</think>`` blocks and markdown code-fence markers,
    sanitizes control characters, then parses the span from the first ``[``
    to the last ``]``. Returns an empty list (and logs a warning) when no
    such span exists or it is not valid JSON.
    """
    cleaned = text
    # Order matters: reasoning blocks first, then the "```json" / "```" fences.
    for pattern, flags in (
        (r'<think>.*?</think>', re.DOTALL),
        (r'```json\s*', 0),
        (r'```\s*', 0),
    ):
        cleaned = re.sub(pattern, '', cleaned, flags=flags)
    cleaned = _sanitize_for_json(cleaned)

    # Greedy on purpose: spans from the first '[' to the last ']'.
    candidate = re.search(r'\[.*\]', cleaned, re.DOTALL)
    if candidate is None:
        logger.warning("LLM review: no JSON array found in response (%.200s)", cleaned[:200])
        return []

    array_text = candidate.group()
    try:
        return json.loads(array_text)
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        logger.warning("LLM review: JSON parse failed: %s | raw snippet: %.200s", e, array_text[:200])
    return []


async def llm_review_entries(
    entries: List[Dict],
    model: Optional[str] = None,
) -> Dict:
    """Run OCR error correction over ``entries`` in one shot.

    With ``REVIEW_ENGINE == "spell"`` and the spell checker available, the
    work is delegated to ``spell_review_entries_sync``. Otherwise every
    entry accepted by ``_entry_needs_review`` is sent to the Ollama chat
    endpoint in a single request and the response is diffed against the
    originals with ``_diff_batch``.

    Args:
        entries: Vocabulary entries (dicts with ``english`` / ``german`` /
            ``example`` and optionally ``row_index``).
        model: Ollama model name; defaults to ``OLLAMA_REVIEW_MODEL``.

    Returns:
        On the LLM path, a dict with ``entries_original``,
        ``entries_corrected`` (one entry per input entry, same order),
        ``changes``, ``skipped_count``, ``model_used`` and ``duration_ms``.
        On the spell path, whatever ``spell_review_entries_sync`` returns.

    Raises:
        Whatever the HTTP client raises for transport errors, and the error
        raised by ``raise_for_status()`` for non-success responses
        (``httpx.HTTPStatusError``).
    """
    from cv_review_spell import spell_review_entries_sync, _SPELL_AVAILABLE

    if REVIEW_ENGINE == "spell":
        if _SPELL_AVAILABLE:
            return spell_review_entries_sync(entries)
        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")

    model = model or OLLAMA_REVIEW_MODEL

    reviewable = [(i, e) for i, e in enumerate(entries) if _entry_needs_review(e)]
    skipped_count = len(entries) - len(reviewable)

    if not reviewable:
        return {
            "entries_original": entries,
            "entries_corrected": [dict(e) for e in entries],
            "changes": [],
            "skipped_count": skipped_count,
            "model_used": model,
            "duration_ms": 0,
        }

    review_entries = [e for _, e in reviewable]
    table_lines = [
        {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
        for e in review_entries
    ]

    # Entries are skipped when they are empty or carry a phonetic
    # transcription (see _entry_needs_review), not because of a digit pattern.
    logger.info("LLM review: sending %d/%d entries to %s (skipped %d empty or phonetic entries)",
                len(review_entries), len(entries), model, skipped_count)

    prompt = _build_llm_prompt(table_lines)

    # Monotonic clock: the measured duration must not be affected by
    # wall-clock adjustments during the (potentially long) request.
    t0 = time.monotonic()
    async with httpx.AsyncClient(timeout=300.0) as client:
        resp = await client.post(
            f"{_OLLAMA_URL}/api/chat",
            json={
                "model": model,
                "messages": [{"role": "user", "content": prompt}],
                "stream": False,
                "think": False,
                "options": {"temperature": 0.1, "num_predict": 8192},
            },
        )
        resp.raise_for_status()
        # Tolerate a null "message" / "content" instead of crashing on len().
        content = (resp.json().get("message") or {}).get("content") or ""
    duration_ms = int((time.monotonic() - t0) * 1000)

    logger.info("LLM review: response in %dms, raw length=%d chars", duration_ms, len(content))

    corrected = _parse_llm_json_array(content)
    changes, corrected_entries = _diff_batch(review_entries, corrected)

    # Put the reviewed entries back at their original positions; entries
    # that were not sent for review stay as plain copies.
    all_corrected = [dict(e) for e in entries]
    for (orig_idx, _), fixed in zip(reviewable, corrected_entries):
        all_corrected[orig_idx] = fixed

    return {
        "entries_original": entries,
        "entries_corrected": all_corrected,
        "changes": changes,
        "skipped_count": skipped_count,
        "model_used": model,
        "duration_ms": duration_ms,
    }


async def llm_review_entries_streaming(
    entries: List[Dict],
    model: Optional[str] = None,
    batch_size: int = _REVIEW_BATCH_SIZE,
):
    """Async generator yielding review progress events (dicts, e.g. for SSE).

    Phase 0 (always): ``_fix_character_confusion`` is run on ``entries``
    **in place** and any resulting field changes are emitted as one
    ``batch`` event with ``progress.current == 0``.

    Spell path (``REVIEW_ENGINE == "spell"`` and spell checker available):
    events from ``spell_review_entries_streaming`` are forwarded; the
    Phase 0 event is inserted right after the first ``meta`` event.

    LLM path: yields the Phase 0 event (if any), one ``meta`` event, one
    ``batch`` event per LLM request and a final ``complete`` event that
    carries all LLM changes and the corrected entries.

    Args:
        entries: Vocabulary entries; mutated in place by Phase 0.
        model: Ollama model name; defaults to ``OLLAMA_REVIEW_MODEL``.
        batch_size: Entries per LLM request. On the LLM path, values below
            1 are clamped to 1.

    Raises:
        HTTP client errors propagate out of the generator, including the
        error raised by ``raise_for_status()`` (``httpx.HTTPStatusError``).
    """
    from cv_ocr_engines import _fix_character_confusion
    from cv_review_spell import spell_review_entries_streaming, _SPELL_AVAILABLE

    # --- Phase 0: snapshot the text fields, fix in place, diff -------------
    conf_fields = ('english', 'german', 'example')
    snapshots = [{f: e.get(f, '') for f in conf_fields} for e in entries]
    _fix_character_confusion(entries)
    # NOTE(review): 'row_index' here is the list position, whereas the LLM
    # batch events below report the entry's own "row_index" field -- confirm
    # that consumers expect both to be the same numbering.
    char_changes = [
        {'row_index': i, 'field': f, 'old': before[f], 'new': entry.get(f, '')}
        for i, (before, entry) in enumerate(zip(snapshots, entries))
        for f in conf_fields
        if before[f] != entry.get(f, '')
    ]
    phase0_event = None
    if char_changes:
        phase0_event = {
            'type': 'batch',
            'changes': char_changes,
            'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
            'progress': {'current': 0, 'total': len(entries)},
        }

    # --- Spell-checker path -------------------------------------------------
    if REVIEW_ENGINE == "spell":
        if _SPELL_AVAILABLE:
            phase0_sent = False
            async for event in spell_review_entries_streaming(entries, batch_size):
                yield event
                # Emit the Phase 0 changes once, directly after the meta event.
                if not phase0_sent and event.get('type') == 'meta' and phase0_event is not None:
                    phase0_sent = True
                    yield phase0_event
            return
        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")

    # --- LLM path -------------------------------------------------------------
    if phase0_event is not None:
        yield phase0_event

    model = model or OLLAMA_REVIEW_MODEL

    # range() below needs a positive step: 0 would raise ValueError and a
    # negative value would silently review nothing.
    if batch_size < 1:
        logger.warning("LLM review streaming: invalid batch_size=%r, using 1", batch_size)
        batch_size = 1

    reviewable = []
    skipped_indices = []
    for i, e in enumerate(entries):
        if _entry_needs_review(e):
            reviewable.append((i, e))
        else:
            skipped_indices.append(i)

    total_to_review = len(reviewable)

    yield {
        "type": "meta",
        "total_entries": len(entries),
        "to_review": total_to_review,
        "skipped": len(skipped_indices),
        "model": model,
        "batch_size": batch_size,
    }

    all_changes = []
    all_corrected = [dict(e) for e in entries]
    total_duration_ms = 0
    reviewed_count = 0

    for batch_index, batch_start in enumerate(range(0, total_to_review, batch_size)):
        batch_items = reviewable[batch_start:batch_start + batch_size]
        batch_entries = [e for _, e in batch_items]

        table_lines = [
            {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
            for e in batch_entries
        ]

        prompt = _build_llm_prompt(table_lines)

        logger.info("LLM review streaming: batch %d -- sending %d entries to %s",
                    batch_index, len(batch_entries), model)

        # Monotonic clock so wall-clock adjustments cannot distort durations.
        t0 = time.monotonic()
        async with httpx.AsyncClient(timeout=300.0) as client:
            resp = await client.post(
                f"{_OLLAMA_URL}/api/chat",
                json={
                    "model": model,
                    "messages": [{"role": "user", "content": prompt}],
                    "stream": False,
                    "think": False,
                    "options": {"temperature": 0.1, "num_predict": 8192},
                },
            )
            resp.raise_for_status()
            # Tolerate a null "message" / "content" in the response body.
            content = (resp.json().get("message") or {}).get("content") or ""
        batch_ms = int((time.monotonic() - t0) * 1000)
        total_duration_ms += batch_ms

        corrected = _parse_llm_json_array(content)
        batch_changes, batch_corrected = _diff_batch(batch_entries, corrected)

        # Write the reviewed entries back to their positions in the full list.
        for (orig_idx, _), fixed in zip(batch_items, batch_corrected):
            all_corrected[orig_idx] = fixed

        all_changes.extend(batch_changes)
        reviewed_count += len(batch_items)

        yield {
            "type": "batch",
            "batch_index": batch_index,
            "entries_reviewed": [e.get("row_index", 0) for _, e in batch_items],
            "changes": batch_changes,
            "duration_ms": batch_ms,
            "progress": {"current": reviewed_count, "total": total_to_review},
        }

    yield {
        "type": "complete",
        "changes": all_changes,
        "model_used": model,
        "duration_ms": total_duration_ms,
        "total_entries": len(entries),
        "reviewed": total_to_review,
        "skipped": len(skipped_indices),
        "corrections_found": len(all_changes),
        "entries_corrected": all_corrected,
    }