breakpilot-lehrer/klausur-service/backend/ocr/gutter/core.py

"""
Gutter Repair Core — spellchecker setup, data types, and single-word repair logic.

Extracted from cv_gutter_repair.py for modularity.

Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""

import itertools
import logging
import re
import uuid
from dataclasses import dataclass, field, asdict
from typing import Any, Dict, List, Optional, Tuple

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Spellchecker setup (lazy, cached)
# ---------------------------------------------------------------------------

# Cached checker instances; both stay None until initialisation succeeds.
_spell_de = None
_spell_en = None
# True only when both checkers were constructed successfully.
_SPELL_AVAILABLE = False
# Set once the import has failed, so the import is not retried (and the
# warning not logged again) on every later lookup.
_SPELL_INIT_FAILED = False


def _init_spellcheckers():
    """Lazy-load the DE + EN spellcheckers (cached across calls).

    The first call tries to import ``spellchecker`` and build one checker per
    language. On success the module-level ``_spell_de`` / ``_spell_en`` are
    set and ``_SPELL_AVAILABLE`` becomes True. On ``ImportError`` a single
    warning is logged, ``_SPELL_AVAILABLE`` stays False, and later calls
    return immediately instead of retrying the import.

    Any other exception raised while constructing a checker propagates to the
    caller; in that case no module state is modified.
    """
    global _spell_de, _spell_en, _SPELL_AVAILABLE, _SPELL_INIT_FAILED
    if _spell_de is not None or _SPELL_INIT_FAILED:
        return
    try:
        from spellchecker import SpellChecker
        # Build into locals first so the globals are only ever published as
        # a complete pair.
        checker_de = SpellChecker(language='de', distance=1)
        checker_en = SpellChecker(language='en', distance=1)
    except ImportError:
        _SPELL_INIT_FAILED = True
        logger.warning("pyspellchecker not installed — gutter repair unavailable")
        return
    _spell_de = checker_de
    _spell_en = checker_en
    _SPELL_AVAILABLE = True
    logger.info("Gutter repair: spellcheckers loaded (DE + EN)")


def _is_known(word: str) -> bool:
    """Return True if *word* is in the German or the English dictionary.

    The lookup is done on the lower-cased word. The German checker is
    consulted first; the English one only if German does not know the word.
    Returns False when the spellcheckers could not be loaded.
    """
    _init_spellcheckers()
    if not _SPELL_AVAILABLE:
        return False
    probe = [word.lower()]
    if _spell_de.known(probe):
        return True
    return bool(_spell_en.known(probe))


def _spell_candidates(word: str, lang: str = "both") -> List[str]:
    """Get all plausible spellchecker candidates for a word (deduplicated).

    Args:
        word: The word to correct; it is lower-cased before lookup.
        lang: ``"both"`` queries German then English, ``"de"`` German only;
            any other value queries English only.

    Returns:
        Candidates other than the lower-cased word itself, without
        duplicates. German candidates come before English ones, and each
        checker's candidates are listed alphabetically so that the result is
        reproducible between runs. Empty when the spellcheckers are
        unavailable or nothing was suggested.
    """
    _init_spellcheckers()
    if not _SPELL_AVAILABLE:
        return []
    w = word.lower()

    if lang == "both":
        checkers = (_spell_de, _spell_en)
    elif lang == "de":
        checkers = (_spell_de,)
    else:
        checkers = (_spell_en,)

    seen: set = set()
    results: List[str] = []
    for checker in checkers:
        if checker is None:
            continue
        # candidates() hands back an unordered collection (or nothing at
        # all); sorting removes the dependence on set iteration order, which
        # would otherwise decide ties between equally distant corrections.
        for cand in sorted(checker.candidates(w) or ()):
            if cand and cand != w and cand not in seen:
                seen.add(cand)
                results.append(cand)

    return results


# ---------------------------------------------------------------------------
# Gutter position detection
# ---------------------------------------------------------------------------

# Words shorter than this are never spell-fixed (very short words are often
# legitimate).
_MIN_WORD_LEN_SPELL = 3

# Minimum fragment length for hyphen-join candidates. Fragments at the gutter
# can be as short as 1-2 chars, e.g. "ve" from "ver-künden".
_MIN_WORD_LEN_HYPHEN = 2

# A word counts as "gutter-adjacent" when its right edge lies at or beyond
# this fraction of the column width, measured from the column's left edge
# (0.70 = the rightmost 30% of the column).
_GUTTER_EDGE_THRESHOLD = 0.70

# Small common words / abbreviations that should NOT be repaired. Words that
# exist in both languages are simply listed twice; the frozenset dedupes them.
_STOPWORDS = frozenset(
    # German
    "ab an am da er es im in ja ob so um zu wo du eh ei je na nu oh".split()
    # English
    + "a am an as at be by do go he if in is it me my no of on or so to up us we".split()
)

# Any of these characters marks a cell as phonetic transcription and causes
# it to be skipped: square brackets, a slash, or one of the listed IPA
# stress/length marks and letters.
_IPA_RE = re.compile(r'[\[\]/ˈˌːʃʒθðŋɑɒæɔəɛɪʊʌ]')


def _is_ipa_text(text: str) -> bool:
    """Return True if *text* contains at least one IPA marker character."""
    return _IPA_RE.search(text) is not None


def _word_is_at_gutter_edge(word_bbox: Dict, col_x: float, col_width: float) -> bool:
    """Check if a word's right edge is near the right boundary of its column.

    Args:
        word_bbox: Mapping with the word's ``"left"`` and ``"width"``; a
            missing key is treated as 0.
        col_x: Left edge of the column, in the same units as the bbox.
        col_width: Width of the column.

    Returns:
        True when the word's right edge, expressed as a fraction of the
        column width measured from ``col_x``, is at least
        ``_GUTTER_EDGE_THRESHOLD``. Always False for a non-positive
        ``col_width``.
    """
    if col_width <= 0:
        return False
    word_right = word_bbox.get("left", 0) + word_bbox.get("width", 0)
    # Position of the word's right edge inside the column: 0.0 is the
    # column's left edge, 1.0 its right edge.
    relative_pos = (word_right - col_x) / col_width
    return relative_pos >= _GUTTER_EDGE_THRESHOLD


# ---------------------------------------------------------------------------
# Suggestion types
# ---------------------------------------------------------------------------

@dataclass
class GutterSuggestion:
    """One proposed correction for a word affected by the book gutter.

    Every field has a default, so instances can be built incrementally with
    keyword arguments. ``to_dict`` produces a plain-dict copy of the record.
    """

    # --- identity and location of the affected cell ---
    # Short identifier: the first 8 hex digits of a random UUID4.
    id: str = field(default_factory=lambda: uuid.uuid4().hex[:8])
    # Kind of suggestion: "hyphen_join" | "spell_fix"
    type: str = ""
    zone_index: int = 0
    row_index: int = 0
    col_index: int = 0
    col_type: str = ""
    cell_id: str = ""

    # --- the correction itself ---
    original_text: str = ""
    suggested_text: str = ""

    # --- only meaningful for hyphen_join suggestions ---
    next_row_index: int = -1
    next_row_cell_id: str = ""
    next_row_text: str = ""
    missing_chars: str = ""
    display_parts: List[str] = field(default_factory=list)

    # Other plausible corrections the user can pick from instead.
    alternatives: List[str] = field(default_factory=list)

    # --- meta ---
    confidence: float = 0.0
    # Why it was suggested:
    # "gutter_truncation" | "gutter_blur" | "hyphen_continuation"
    reason: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a dict, keyed by field name in declaration order."""
        return asdict(self)


# ---------------------------------------------------------------------------
# Core repair logic
# ---------------------------------------------------------------------------

_TRAILING_PUNCT_RE = re.compile(r'[.,;:!?\)\]]+$')


def _try_hyphen_join(
    word_text: str,
    next_word_text: str,
    max_missing: int = 3,
) -> Optional[Tuple[str, str, float]]:
    """Try joining two fragments with 0..max_missing interpolated chars.

    Surrounding whitespace is removed from both fragments first. The trailing
    hyphen(s) are then dropped from the first fragment and trailing
    punctuation from the continuation (e.g. "künden," → "künden") so that the
    dictionary lookup can succeed.

    The direct join is tested first. After that, every combination of 1, 2,
    ... ``max_missing`` letters taken from the 15 most common letters is
    inserted between the fragments, fewest letters first; the first
    dictionary hit wins. With the default ``max_missing=3`` this is up to
    15 + 225 + 3375 lookups when nothing matches.

    Args:
        word_text: Fragment at the end of the line, usually ending in "-".
        next_word_text: Fragment that continues the word.
        max_missing: Largest number of lost characters to interpolate.

    Returns:
        ``(joined_word, missing_chars, confidence)`` or None. Confidence is
        0.95 for a direct join, 0.90 for one interpolated character and 0.10
        less for each additional one. The joined word does not include the
        punctuation stripped from the continuation.
    """
    # Strip whitespace BEFORE the hyphen: with the opposite order a trailing
    # blank ("ver- ") shields the hyphen and it stays part of the base.
    base = word_text.strip().rstrip("-").rstrip()
    # Likewise trim the continuation on both sides, otherwise trailing
    # whitespace keeps the end-anchored punctuation pattern from matching.
    continuation = _TRAILING_PUNCT_RE.sub('', next_word_text.strip()).rstrip()

    if not base or not continuation:
        return None

    # 1. Direct join (no missing chars)
    direct = base + continuation
    if _is_known(direct):
        return (direct, "", 0.95)

    # 2. Try with 1..max_missing missing characters
    # Use common letters, weighted by frequency in German/English
    _COMMON_CHARS = "enristaldhgcmobwfkzpvjyxqu"
    fill_chars = _COMMON_CHARS[:15]

    for n_missing in range(1, max_missing + 1):
        # Confidence decreases with more missing chars
        conf = 0.90 - (n_missing - 1) * 0.10
        for chars in itertools.product(fill_chars, repeat=n_missing):
            missing = "".join(chars)
            candidate = base + missing + continuation
            if _is_known(candidate):
                return (candidate, missing, conf)

    return None


def _try_spell_fix(
    word_text: str, col_type: str = "",
) -> Optional[Tuple[str, float, List[str]]]:
    """Try to repair a single garbled gutter word with the spellchecker.

    Args:
        word_text: The word as read by OCR.
        col_type: Column type string. If it contains "en" the English
            dictionary is tried first, otherwise if it contains "de" the
            German one; with neither, both dictionaries are used.

    Returns:
        ``(best_correction, confidence, alternatives)`` or None when the word
        is too short, is already a valid word once surrounding parentheses
        are removed, or has no candidates. ``alternatives`` holds up to five
        further corrections the user can choose from (e.g. "stammelt" vs
        "stammeln"). A capitalised input yields capitalised corrections.
    """
    if len(word_text) < _MIN_WORD_LEN_SPELL:
        return None

    # "probieren)" or "(Englisch" are valid words carrying punctuation, not
    # OCR errors, so no correction is offered for them.
    bare = word_text.strip("()")
    if bare and _is_known(bare):
        return None

    # Language priority from the column type; "en" is checked before "de".
    lang = "both"
    for code in ("en", "de"):
        if code in col_type:
            lang = code
            break

    pool = _spell_candidates(word_text, lang=lang)
    if not pool and lang != "both":
        # Nothing in the preferred language: widen the search to both.
        pool = _spell_candidates(word_text, lang="both")
    if not pool:
        return None

    capitalise = word_text[0].isupper()

    def _preserve_case(w: str) -> str:
        # Re-apply the original's leading capital to a suggestion.
        return w[0].upper() + w[1:] if capitalise and w else w

    # Rank by edit distance to the original, closest first; the sort is
    # stable, so equally distant candidates keep their incoming order.
    needle = word_text.lower()
    ranked = sorted(
        ((_edit_distance(needle, cand.lower()), cand) for cand in pool),
        key=lambda pair: pair[0],
    )

    top_dist, top_raw = ranked[0]
    best = _preserve_case(top_raw)
    conf = max(0.5, 1.0 - top_dist * 0.15)

    # Remaining candidates become alternatives (case-preserved, at most 5).
    best_key = best.lower()
    alts = [
        _preserve_case(cand)
        for _, cand in ranked[1:]
        if cand.lower() != best_key
    ][:5]

    return (best, conf, alts)


def _edit_distance(a: str, b: str) -> int:
    """Return the Levenshtein distance between *a* and *b*.

    Insertions, deletions and substitutions each cost 1. Only one row of the
    dynamic-programming table is kept at a time, sized by the shorter string.
    """
    # Walk over the longer string and keep the row as wide as the shorter one.
    longer, shorter = (a, b) if len(a) >= len(b) else (b, a)
    if not shorter:
        return len(longer)

    row = list(range(len(shorter) + 1))
    for i, ch_long in enumerate(longer, start=1):
        nxt = [i]
        for j, ch_short in enumerate(shorter, start=1):
            insert_cost = nxt[j - 1] + 1
            delete_cost = row[j] + 1
            replace_cost = row[j - 1] + (0 if ch_long == ch_short else 1)
            nxt.append(min(insert_cost, delete_cost, replace_cost))
        row = nxt
    return row[-1]